mirror of
https://github.com/meta-llama/llama-stack.git
synced 2025-12-28 01:21:59 +00:00
Merge branch 'main' into feat/litellm_sambanova_usage
This commit is contained in:
commit
b7f16ac7a6
535 changed files with 23539 additions and 8112 deletions
6
docs/_static/css/my_theme.css
vendored
6
docs/_static/css/my_theme.css
vendored
|
|
@ -27,3 +27,9 @@ pre {
|
|||
white-space: pre-wrap !important;
|
||||
word-break: break-all;
|
||||
}
|
||||
|
||||
[data-theme="dark"] .mermaid {
|
||||
background-color: #f4f4f6 !important;
|
||||
border-radius: 6px;
|
||||
padding: 0.5em;
|
||||
}
|
||||
|
|
|
|||
29
docs/_static/js/detect_theme.js
vendored
29
docs/_static/js/detect_theme.js
vendored
|
|
@ -1,9 +1,32 @@
|
|||
document.addEventListener("DOMContentLoaded", function () {
|
||||
const prefersDark = window.matchMedia("(prefers-color-scheme: dark)").matches;
|
||||
const htmlElement = document.documentElement;
|
||||
if (prefersDark) {
|
||||
htmlElement.setAttribute("data-theme", "dark");
|
||||
|
||||
// Check if theme is saved in localStorage
|
||||
const savedTheme = localStorage.getItem("sphinx-rtd-theme");
|
||||
|
||||
if (savedTheme) {
|
||||
// Use the saved theme preference
|
||||
htmlElement.setAttribute("data-theme", savedTheme);
|
||||
document.body.classList.toggle("dark", savedTheme === "dark");
|
||||
} else {
|
||||
htmlElement.setAttribute("data-theme", "light");
|
||||
// Fall back to system preference
|
||||
const theme = prefersDark ? "dark" : "light";
|
||||
htmlElement.setAttribute("data-theme", theme);
|
||||
document.body.classList.toggle("dark", theme === "dark");
|
||||
// Save initial preference
|
||||
localStorage.setItem("sphinx-rtd-theme", theme);
|
||||
}
|
||||
|
||||
// Listen for theme changes from the existing toggle
|
||||
const observer = new MutationObserver(function(mutations) {
|
||||
mutations.forEach(function(mutation) {
|
||||
if (mutation.attributeName === "data-theme") {
|
||||
const currentTheme = htmlElement.getAttribute("data-theme");
|
||||
localStorage.setItem("sphinx-rtd-theme", currentTheme);
|
||||
}
|
||||
});
|
||||
});
|
||||
|
||||
observer.observe(htmlElement, { attributes: true });
|
||||
});
|
||||
|
|
|
|||
537
docs/_static/llama-stack-spec.html
vendored
537
docs/_static/llama-stack-spec.html
vendored
|
|
@ -497,6 +497,54 @@
|
|||
}
|
||||
}
|
||||
},
|
||||
"/v1/openai/v1/responses": {
|
||||
"post": {
|
||||
"responses": {
|
||||
"200": {
|
||||
"description": "Runtime representation of an annotated type.",
|
||||
"content": {
|
||||
"application/json": {
|
||||
"schema": {
|
||||
"$ref": "#/components/schemas/OpenAIResponseObject"
|
||||
}
|
||||
},
|
||||
"text/event-stream": {
|
||||
"schema": {
|
||||
"$ref": "#/components/schemas/OpenAIResponseObjectStream"
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"400": {
|
||||
"$ref": "#/components/responses/BadRequest400"
|
||||
},
|
||||
"429": {
|
||||
"$ref": "#/components/responses/TooManyRequests429"
|
||||
},
|
||||
"500": {
|
||||
"$ref": "#/components/responses/InternalServerError500"
|
||||
},
|
||||
"default": {
|
||||
"$ref": "#/components/responses/DefaultError"
|
||||
}
|
||||
},
|
||||
"tags": [
|
||||
"Agents"
|
||||
],
|
||||
"description": "Create a new OpenAI response.",
|
||||
"parameters": [],
|
||||
"requestBody": {
|
||||
"content": {
|
||||
"application/json": {
|
||||
"schema": {
|
||||
"$ref": "#/components/schemas/CreateOpenaiResponseRequest"
|
||||
}
|
||||
}
|
||||
},
|
||||
"required": true
|
||||
}
|
||||
}
|
||||
},
|
||||
"/v1/files": {
|
||||
"get": {
|
||||
"responses": {
|
||||
|
|
@ -1278,6 +1326,49 @@
|
|||
]
|
||||
}
|
||||
},
|
||||
"/v1/openai/v1/responses/{id}": {
|
||||
"get": {
|
||||
"responses": {
|
||||
"200": {
|
||||
"description": "An OpenAIResponseObject.",
|
||||
"content": {
|
||||
"application/json": {
|
||||
"schema": {
|
||||
"$ref": "#/components/schemas/OpenAIResponseObject"
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"400": {
|
||||
"$ref": "#/components/responses/BadRequest400"
|
||||
},
|
||||
"429": {
|
||||
"$ref": "#/components/responses/TooManyRequests429"
|
||||
},
|
||||
"500": {
|
||||
"$ref": "#/components/responses/InternalServerError500"
|
||||
},
|
||||
"default": {
|
||||
"$ref": "#/components/responses/DefaultError"
|
||||
}
|
||||
},
|
||||
"tags": [
|
||||
"Agents"
|
||||
],
|
||||
"description": "Retrieve an OpenAI response by its ID.",
|
||||
"parameters": [
|
||||
{
|
||||
"name": "id",
|
||||
"in": "path",
|
||||
"description": "The ID of the OpenAI response to retrieve.",
|
||||
"required": true,
|
||||
"schema": {
|
||||
"type": "string"
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
},
|
||||
"/v1/scoring-functions/{scoring_fn_id}": {
|
||||
"get": {
|
||||
"responses": {
|
||||
|
|
@ -5221,17 +5312,25 @@
|
|||
"default": 10
|
||||
},
|
||||
"model": {
|
||||
"type": "string"
|
||||
"type": "string",
|
||||
"description": "The model identifier to use for the agent"
|
||||
},
|
||||
"instructions": {
|
||||
"type": "string"
|
||||
"type": "string",
|
||||
"description": "The system instructions for the agent"
|
||||
},
|
||||
"name": {
|
||||
"type": "string",
|
||||
"description": "Optional name for the agent, used in telemetry and identification"
|
||||
},
|
||||
"enable_session_persistence": {
|
||||
"type": "boolean",
|
||||
"default": false
|
||||
"default": false,
|
||||
"description": "Optional flag indicating whether session data has to be persisted"
|
||||
},
|
||||
"response_format": {
|
||||
"$ref": "#/components/schemas/ResponseFormat"
|
||||
"$ref": "#/components/schemas/ResponseFormat",
|
||||
"description": "Optional response format configuration"
|
||||
}
|
||||
},
|
||||
"additionalProperties": false,
|
||||
|
|
@ -5239,7 +5338,8 @@
|
|||
"model",
|
||||
"instructions"
|
||||
],
|
||||
"title": "AgentConfig"
|
||||
"title": "AgentConfig",
|
||||
"description": "Configuration for an agent."
|
||||
},
|
||||
"AgentTool": {
|
||||
"oneOf": [
|
||||
|
|
@ -6183,6 +6283,430 @@
|
|||
],
|
||||
"title": "AgentTurnResponseTurnStartPayload"
|
||||
},
|
||||
"OpenAIResponseInputMessage": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"content": {
|
||||
"oneOf": [
|
||||
{
|
||||
"type": "string"
|
||||
},
|
||||
{
|
||||
"type": "array",
|
||||
"items": {
|
||||
"$ref": "#/components/schemas/OpenAIResponseInputMessageContent"
|
||||
}
|
||||
}
|
||||
]
|
||||
},
|
||||
"role": {
|
||||
"oneOf": [
|
||||
{
|
||||
"type": "string",
|
||||
"const": "system"
|
||||
},
|
||||
{
|
||||
"type": "string",
|
||||
"const": "developer"
|
||||
},
|
||||
{
|
||||
"type": "string",
|
||||
"const": "user"
|
||||
},
|
||||
{
|
||||
"type": "string",
|
||||
"const": "assistant"
|
||||
}
|
||||
]
|
||||
},
|
||||
"type": {
|
||||
"type": "string",
|
||||
"const": "message",
|
||||
"default": "message"
|
||||
}
|
||||
},
|
||||
"additionalProperties": false,
|
||||
"required": [
|
||||
"content",
|
||||
"role"
|
||||
],
|
||||
"title": "OpenAIResponseInputMessage"
|
||||
},
|
||||
"OpenAIResponseInputMessageContent": {
|
||||
"oneOf": [
|
||||
{
|
||||
"$ref": "#/components/schemas/OpenAIResponseInputMessageContentText"
|
||||
},
|
||||
{
|
||||
"$ref": "#/components/schemas/OpenAIResponseInputMessageContentImage"
|
||||
}
|
||||
],
|
||||
"discriminator": {
|
||||
"propertyName": "type",
|
||||
"mapping": {
|
||||
"input_text": "#/components/schemas/OpenAIResponseInputMessageContentText",
|
||||
"input_image": "#/components/schemas/OpenAIResponseInputMessageContentImage"
|
||||
}
|
||||
}
|
||||
},
|
||||
"OpenAIResponseInputMessageContentImage": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"detail": {
|
||||
"oneOf": [
|
||||
{
|
||||
"type": "string",
|
||||
"const": "low"
|
||||
},
|
||||
{
|
||||
"type": "string",
|
||||
"const": "high"
|
||||
},
|
||||
{
|
||||
"type": "string",
|
||||
"const": "auto"
|
||||
}
|
||||
],
|
||||
"default": "auto"
|
||||
},
|
||||
"type": {
|
||||
"type": "string",
|
||||
"const": "input_image",
|
||||
"default": "input_image"
|
||||
},
|
||||
"image_url": {
|
||||
"type": "string"
|
||||
}
|
||||
},
|
||||
"additionalProperties": false,
|
||||
"required": [
|
||||
"detail",
|
||||
"type"
|
||||
],
|
||||
"title": "OpenAIResponseInputMessageContentImage"
|
||||
},
|
||||
"OpenAIResponseInputMessageContentText": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"text": {
|
||||
"type": "string"
|
||||
},
|
||||
"type": {
|
||||
"type": "string",
|
||||
"const": "input_text",
|
||||
"default": "input_text"
|
||||
}
|
||||
},
|
||||
"additionalProperties": false,
|
||||
"required": [
|
||||
"text",
|
||||
"type"
|
||||
],
|
||||
"title": "OpenAIResponseInputMessageContentText"
|
||||
},
|
||||
"OpenAIResponseInputTool": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"type": {
|
||||
"oneOf": [
|
||||
{
|
||||
"type": "string",
|
||||
"const": "web_search"
|
||||
},
|
||||
{
|
||||
"type": "string",
|
||||
"const": "web_search_preview_2025_03_11"
|
||||
}
|
||||
],
|
||||
"default": "web_search"
|
||||
},
|
||||
"search_context_size": {
|
||||
"type": "string",
|
||||
"default": "medium"
|
||||
}
|
||||
},
|
||||
"additionalProperties": false,
|
||||
"required": [
|
||||
"type"
|
||||
],
|
||||
"title": "OpenAIResponseInputToolWebSearch"
|
||||
},
|
||||
"CreateOpenaiResponseRequest": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"input": {
|
||||
"oneOf": [
|
||||
{
|
||||
"type": "string"
|
||||
},
|
||||
{
|
||||
"type": "array",
|
||||
"items": {
|
||||
"$ref": "#/components/schemas/OpenAIResponseInputMessage"
|
||||
}
|
||||
}
|
||||
],
|
||||
"description": "Input message(s) to create the response."
|
||||
},
|
||||
"model": {
|
||||
"type": "string",
|
||||
"description": "The underlying LLM used for completions."
|
||||
},
|
||||
"previous_response_id": {
|
||||
"type": "string",
|
||||
"description": "(Optional) if specified, the new response will be a continuation of the previous response. This can be used to easily fork-off new responses from existing responses."
|
||||
},
|
||||
"store": {
|
||||
"type": "boolean"
|
||||
},
|
||||
"stream": {
|
||||
"type": "boolean"
|
||||
},
|
||||
"temperature": {
|
||||
"type": "number"
|
||||
},
|
||||
"tools": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"$ref": "#/components/schemas/OpenAIResponseInputTool"
|
||||
}
|
||||
}
|
||||
},
|
||||
"additionalProperties": false,
|
||||
"required": [
|
||||
"input",
|
||||
"model"
|
||||
],
|
||||
"title": "CreateOpenaiResponseRequest"
|
||||
},
|
||||
"OpenAIResponseError": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"code": {
|
||||
"type": "string"
|
||||
},
|
||||
"message": {
|
||||
"type": "string"
|
||||
}
|
||||
},
|
||||
"additionalProperties": false,
|
||||
"required": [
|
||||
"code",
|
||||
"message"
|
||||
],
|
||||
"title": "OpenAIResponseError"
|
||||
},
|
||||
"OpenAIResponseObject": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"created_at": {
|
||||
"type": "integer"
|
||||
},
|
||||
"error": {
|
||||
"$ref": "#/components/schemas/OpenAIResponseError"
|
||||
},
|
||||
"id": {
|
||||
"type": "string"
|
||||
},
|
||||
"model": {
|
||||
"type": "string"
|
||||
},
|
||||
"object": {
|
||||
"type": "string",
|
||||
"const": "response",
|
||||
"default": "response"
|
||||
},
|
||||
"output": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"$ref": "#/components/schemas/OpenAIResponseOutput"
|
||||
}
|
||||
},
|
||||
"parallel_tool_calls": {
|
||||
"type": "boolean",
|
||||
"default": false
|
||||
},
|
||||
"previous_response_id": {
|
||||
"type": "string"
|
||||
},
|
||||
"status": {
|
||||
"type": "string"
|
||||
},
|
||||
"temperature": {
|
||||
"type": "number"
|
||||
},
|
||||
"top_p": {
|
||||
"type": "number"
|
||||
},
|
||||
"truncation": {
|
||||
"type": "string"
|
||||
},
|
||||
"user": {
|
||||
"type": "string"
|
||||
}
|
||||
},
|
||||
"additionalProperties": false,
|
||||
"required": [
|
||||
"created_at",
|
||||
"id",
|
||||
"model",
|
||||
"object",
|
||||
"output",
|
||||
"parallel_tool_calls",
|
||||
"status"
|
||||
],
|
||||
"title": "OpenAIResponseObject"
|
||||
},
|
||||
"OpenAIResponseOutput": {
|
||||
"oneOf": [
|
||||
{
|
||||
"$ref": "#/components/schemas/OpenAIResponseOutputMessage"
|
||||
},
|
||||
{
|
||||
"$ref": "#/components/schemas/OpenAIResponseOutputMessageWebSearchToolCall"
|
||||
}
|
||||
],
|
||||
"discriminator": {
|
||||
"propertyName": "type",
|
||||
"mapping": {
|
||||
"message": "#/components/schemas/OpenAIResponseOutputMessage",
|
||||
"web_search_call": "#/components/schemas/OpenAIResponseOutputMessageWebSearchToolCall"
|
||||
}
|
||||
}
|
||||
},
|
||||
"OpenAIResponseOutputMessage": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"id": {
|
||||
"type": "string"
|
||||
},
|
||||
"content": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"$ref": "#/components/schemas/OpenAIResponseOutputMessageContent"
|
||||
}
|
||||
},
|
||||
"role": {
|
||||
"type": "string",
|
||||
"const": "assistant",
|
||||
"default": "assistant"
|
||||
},
|
||||
"status": {
|
||||
"type": "string"
|
||||
},
|
||||
"type": {
|
||||
"type": "string",
|
||||
"const": "message",
|
||||
"default": "message"
|
||||
}
|
||||
},
|
||||
"additionalProperties": false,
|
||||
"required": [
|
||||
"id",
|
||||
"content",
|
||||
"role",
|
||||
"status",
|
||||
"type"
|
||||
],
|
||||
"title": "OpenAIResponseOutputMessage"
|
||||
},
|
||||
"OpenAIResponseOutputMessageContent": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"text": {
|
||||
"type": "string"
|
||||
},
|
||||
"type": {
|
||||
"type": "string",
|
||||
"const": "output_text",
|
||||
"default": "output_text"
|
||||
}
|
||||
},
|
||||
"additionalProperties": false,
|
||||
"required": [
|
||||
"text",
|
||||
"type"
|
||||
],
|
||||
"title": "OpenAIResponseOutputMessageContentOutputText"
|
||||
},
|
||||
"OpenAIResponseOutputMessageWebSearchToolCall": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"id": {
|
||||
"type": "string"
|
||||
},
|
||||
"status": {
|
||||
"type": "string"
|
||||
},
|
||||
"type": {
|
||||
"type": "string",
|
||||
"const": "web_search_call",
|
||||
"default": "web_search_call"
|
||||
}
|
||||
},
|
||||
"additionalProperties": false,
|
||||
"required": [
|
||||
"id",
|
||||
"status",
|
||||
"type"
|
||||
],
|
||||
"title": "OpenAIResponseOutputMessageWebSearchToolCall"
|
||||
},
|
||||
"OpenAIResponseObjectStream": {
|
||||
"oneOf": [
|
||||
{
|
||||
"$ref": "#/components/schemas/OpenAIResponseObjectStreamResponseCreated"
|
||||
},
|
||||
{
|
||||
"$ref": "#/components/schemas/OpenAIResponseObjectStreamResponseCompleted"
|
||||
}
|
||||
],
|
||||
"discriminator": {
|
||||
"propertyName": "type",
|
||||
"mapping": {
|
||||
"response.created": "#/components/schemas/OpenAIResponseObjectStreamResponseCreated",
|
||||
"response.completed": "#/components/schemas/OpenAIResponseObjectStreamResponseCompleted"
|
||||
}
|
||||
}
|
||||
},
|
||||
"OpenAIResponseObjectStreamResponseCompleted": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"response": {
|
||||
"$ref": "#/components/schemas/OpenAIResponseObject"
|
||||
},
|
||||
"type": {
|
||||
"type": "string",
|
||||
"const": "response.completed",
|
||||
"default": "response.completed"
|
||||
}
|
||||
},
|
||||
"additionalProperties": false,
|
||||
"required": [
|
||||
"response",
|
||||
"type"
|
||||
],
|
||||
"title": "OpenAIResponseObjectStreamResponseCompleted"
|
||||
},
|
||||
"OpenAIResponseObjectStreamResponseCreated": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"response": {
|
||||
"$ref": "#/components/schemas/OpenAIResponseObject"
|
||||
},
|
||||
"type": {
|
||||
"type": "string",
|
||||
"const": "response.created",
|
||||
"default": "response.created"
|
||||
}
|
||||
},
|
||||
"additionalProperties": false,
|
||||
"required": [
|
||||
"response",
|
||||
"type"
|
||||
],
|
||||
"title": "OpenAIResponseObjectStreamResponseCreated"
|
||||
},
|
||||
"CreateUploadSessionRequest": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
|
|
@ -8891,8 +9415,7 @@
|
|||
},
|
||||
"additionalProperties": false,
|
||||
"required": [
|
||||
"role",
|
||||
"content"
|
||||
"role"
|
||||
],
|
||||
"title": "OpenAIAssistantMessageParam",
|
||||
"description": "A message containing the model's (assistant) response in an OpenAI-compatible chat completion request."
|
||||
|
|
|
|||
364
docs/_static/llama-stack-spec.yaml
vendored
364
docs/_static/llama-stack-spec.yaml
vendored
|
|
@ -330,6 +330,39 @@ paths:
|
|||
schema:
|
||||
$ref: '#/components/schemas/CreateAgentTurnRequest'
|
||||
required: true
|
||||
/v1/openai/v1/responses:
|
||||
post:
|
||||
responses:
|
||||
'200':
|
||||
description: >-
|
||||
Runtime representation of an annotated type.
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: '#/components/schemas/OpenAIResponseObject'
|
||||
text/event-stream:
|
||||
schema:
|
||||
$ref: '#/components/schemas/OpenAIResponseObjectStream'
|
||||
'400':
|
||||
$ref: '#/components/responses/BadRequest400'
|
||||
'429':
|
||||
$ref: >-
|
||||
#/components/responses/TooManyRequests429
|
||||
'500':
|
||||
$ref: >-
|
||||
#/components/responses/InternalServerError500
|
||||
default:
|
||||
$ref: '#/components/responses/DefaultError'
|
||||
tags:
|
||||
- Agents
|
||||
description: Create a new OpenAI response.
|
||||
parameters: []
|
||||
requestBody:
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: '#/components/schemas/CreateOpenaiResponseRequest'
|
||||
required: true
|
||||
/v1/files:
|
||||
get:
|
||||
responses:
|
||||
|
|
@ -875,6 +908,36 @@ paths:
|
|||
required: true
|
||||
schema:
|
||||
type: string
|
||||
/v1/openai/v1/responses/{id}:
|
||||
get:
|
||||
responses:
|
||||
'200':
|
||||
description: An OpenAIResponseObject.
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: '#/components/schemas/OpenAIResponseObject'
|
||||
'400':
|
||||
$ref: '#/components/responses/BadRequest400'
|
||||
'429':
|
||||
$ref: >-
|
||||
#/components/responses/TooManyRequests429
|
||||
'500':
|
||||
$ref: >-
|
||||
#/components/responses/InternalServerError500
|
||||
default:
|
||||
$ref: '#/components/responses/DefaultError'
|
||||
tags:
|
||||
- Agents
|
||||
description: Retrieve an OpenAI response by its ID.
|
||||
parameters:
|
||||
- name: id
|
||||
in: path
|
||||
description: >-
|
||||
The ID of the OpenAI response to retrieve.
|
||||
required: true
|
||||
schema:
|
||||
type: string
|
||||
/v1/scoring-functions/{scoring_fn_id}:
|
||||
get:
|
||||
responses:
|
||||
|
|
@ -3686,18 +3749,29 @@ components:
|
|||
default: 10
|
||||
model:
|
||||
type: string
|
||||
description: >-
|
||||
The model identifier to use for the agent
|
||||
instructions:
|
||||
type: string
|
||||
description: The system instructions for the agent
|
||||
name:
|
||||
type: string
|
||||
description: >-
|
||||
Optional name for the agent, used in telemetry and identification
|
||||
enable_session_persistence:
|
||||
type: boolean
|
||||
default: false
|
||||
description: >-
|
||||
Optional flag indicating whether session data has to be persisted
|
||||
response_format:
|
||||
$ref: '#/components/schemas/ResponseFormat'
|
||||
description: Optional response format configuration
|
||||
additionalProperties: false
|
||||
required:
|
||||
- model
|
||||
- instructions
|
||||
title: AgentConfig
|
||||
description: Configuration for an agent.
|
||||
AgentTool:
|
||||
oneOf:
|
||||
- type: string
|
||||
|
|
@ -4318,6 +4392,295 @@ components:
|
|||
- event_type
|
||||
- turn_id
|
||||
title: AgentTurnResponseTurnStartPayload
|
||||
OpenAIResponseInputMessage:
|
||||
type: object
|
||||
properties:
|
||||
content:
|
||||
oneOf:
|
||||
- type: string
|
||||
- type: array
|
||||
items:
|
||||
$ref: '#/components/schemas/OpenAIResponseInputMessageContent'
|
||||
role:
|
||||
oneOf:
|
||||
- type: string
|
||||
const: system
|
||||
- type: string
|
||||
const: developer
|
||||
- type: string
|
||||
const: user
|
||||
- type: string
|
||||
const: assistant
|
||||
type:
|
||||
type: string
|
||||
const: message
|
||||
default: message
|
||||
additionalProperties: false
|
||||
required:
|
||||
- content
|
||||
- role
|
||||
title: OpenAIResponseInputMessage
|
||||
OpenAIResponseInputMessageContent:
|
||||
oneOf:
|
||||
- $ref: '#/components/schemas/OpenAIResponseInputMessageContentText'
|
||||
- $ref: '#/components/schemas/OpenAIResponseInputMessageContentImage'
|
||||
discriminator:
|
||||
propertyName: type
|
||||
mapping:
|
||||
input_text: '#/components/schemas/OpenAIResponseInputMessageContentText'
|
||||
input_image: '#/components/schemas/OpenAIResponseInputMessageContentImage'
|
||||
OpenAIResponseInputMessageContentImage:
|
||||
type: object
|
||||
properties:
|
||||
detail:
|
||||
oneOf:
|
||||
- type: string
|
||||
const: low
|
||||
- type: string
|
||||
const: high
|
||||
- type: string
|
||||
const: auto
|
||||
default: auto
|
||||
type:
|
||||
type: string
|
||||
const: input_image
|
||||
default: input_image
|
||||
image_url:
|
||||
type: string
|
||||
additionalProperties: false
|
||||
required:
|
||||
- detail
|
||||
- type
|
||||
title: OpenAIResponseInputMessageContentImage
|
||||
OpenAIResponseInputMessageContentText:
|
||||
type: object
|
||||
properties:
|
||||
text:
|
||||
type: string
|
||||
type:
|
||||
type: string
|
||||
const: input_text
|
||||
default: input_text
|
||||
additionalProperties: false
|
||||
required:
|
||||
- text
|
||||
- type
|
||||
title: OpenAIResponseInputMessageContentText
|
||||
OpenAIResponseInputTool:
|
||||
type: object
|
||||
properties:
|
||||
type:
|
||||
oneOf:
|
||||
- type: string
|
||||
const: web_search
|
||||
- type: string
|
||||
const: web_search_preview_2025_03_11
|
||||
default: web_search
|
||||
search_context_size:
|
||||
type: string
|
||||
default: medium
|
||||
additionalProperties: false
|
||||
required:
|
||||
- type
|
||||
title: OpenAIResponseInputToolWebSearch
|
||||
CreateOpenaiResponseRequest:
|
||||
type: object
|
||||
properties:
|
||||
input:
|
||||
oneOf:
|
||||
- type: string
|
||||
- type: array
|
||||
items:
|
||||
$ref: '#/components/schemas/OpenAIResponseInputMessage'
|
||||
description: Input message(s) to create the response.
|
||||
model:
|
||||
type: string
|
||||
description: The underlying LLM used for completions.
|
||||
previous_response_id:
|
||||
type: string
|
||||
description: >-
|
||||
(Optional) if specified, the new response will be a continuation of the
|
||||
previous response. This can be used to easily fork-off new responses from
|
||||
existing responses.
|
||||
store:
|
||||
type: boolean
|
||||
stream:
|
||||
type: boolean
|
||||
temperature:
|
||||
type: number
|
||||
tools:
|
||||
type: array
|
||||
items:
|
||||
$ref: '#/components/schemas/OpenAIResponseInputTool'
|
||||
additionalProperties: false
|
||||
required:
|
||||
- input
|
||||
- model
|
||||
title: CreateOpenaiResponseRequest
|
||||
OpenAIResponseError:
|
||||
type: object
|
||||
properties:
|
||||
code:
|
||||
type: string
|
||||
message:
|
||||
type: string
|
||||
additionalProperties: false
|
||||
required:
|
||||
- code
|
||||
- message
|
||||
title: OpenAIResponseError
|
||||
OpenAIResponseObject:
|
||||
type: object
|
||||
properties:
|
||||
created_at:
|
||||
type: integer
|
||||
error:
|
||||
$ref: '#/components/schemas/OpenAIResponseError'
|
||||
id:
|
||||
type: string
|
||||
model:
|
||||
type: string
|
||||
object:
|
||||
type: string
|
||||
const: response
|
||||
default: response
|
||||
output:
|
||||
type: array
|
||||
items:
|
||||
$ref: '#/components/schemas/OpenAIResponseOutput'
|
||||
parallel_tool_calls:
|
||||
type: boolean
|
||||
default: false
|
||||
previous_response_id:
|
||||
type: string
|
||||
status:
|
||||
type: string
|
||||
temperature:
|
||||
type: number
|
||||
top_p:
|
||||
type: number
|
||||
truncation:
|
||||
type: string
|
||||
user:
|
||||
type: string
|
||||
additionalProperties: false
|
||||
required:
|
||||
- created_at
|
||||
- id
|
||||
- model
|
||||
- object
|
||||
- output
|
||||
- parallel_tool_calls
|
||||
- status
|
||||
title: OpenAIResponseObject
|
||||
OpenAIResponseOutput:
|
||||
oneOf:
|
||||
- $ref: '#/components/schemas/OpenAIResponseOutputMessage'
|
||||
- $ref: '#/components/schemas/OpenAIResponseOutputMessageWebSearchToolCall'
|
||||
discriminator:
|
||||
propertyName: type
|
||||
mapping:
|
||||
message: '#/components/schemas/OpenAIResponseOutputMessage'
|
||||
web_search_call: '#/components/schemas/OpenAIResponseOutputMessageWebSearchToolCall'
|
||||
OpenAIResponseOutputMessage:
|
||||
type: object
|
||||
properties:
|
||||
id:
|
||||
type: string
|
||||
content:
|
||||
type: array
|
||||
items:
|
||||
$ref: '#/components/schemas/OpenAIResponseOutputMessageContent'
|
||||
role:
|
||||
type: string
|
||||
const: assistant
|
||||
default: assistant
|
||||
status:
|
||||
type: string
|
||||
type:
|
||||
type: string
|
||||
const: message
|
||||
default: message
|
||||
additionalProperties: false
|
||||
required:
|
||||
- id
|
||||
- content
|
||||
- role
|
||||
- status
|
||||
- type
|
||||
title: OpenAIResponseOutputMessage
|
||||
OpenAIResponseOutputMessageContent:
|
||||
type: object
|
||||
properties:
|
||||
text:
|
||||
type: string
|
||||
type:
|
||||
type: string
|
||||
const: output_text
|
||||
default: output_text
|
||||
additionalProperties: false
|
||||
required:
|
||||
- text
|
||||
- type
|
||||
title: >-
|
||||
OpenAIResponseOutputMessageContentOutputText
|
||||
"OpenAIResponseOutputMessageWebSearchToolCall":
|
||||
type: object
|
||||
properties:
|
||||
id:
|
||||
type: string
|
||||
status:
|
||||
type: string
|
||||
type:
|
||||
type: string
|
||||
const: web_search_call
|
||||
default: web_search_call
|
||||
additionalProperties: false
|
||||
required:
|
||||
- id
|
||||
- status
|
||||
- type
|
||||
title: >-
|
||||
OpenAIResponseOutputMessageWebSearchToolCall
|
||||
OpenAIResponseObjectStream:
|
||||
oneOf:
|
||||
- $ref: '#/components/schemas/OpenAIResponseObjectStreamResponseCreated'
|
||||
- $ref: '#/components/schemas/OpenAIResponseObjectStreamResponseCompleted'
|
||||
discriminator:
|
||||
propertyName: type
|
||||
mapping:
|
||||
response.created: '#/components/schemas/OpenAIResponseObjectStreamResponseCreated'
|
||||
response.completed: '#/components/schemas/OpenAIResponseObjectStreamResponseCompleted'
|
||||
"OpenAIResponseObjectStreamResponseCompleted":
|
||||
type: object
|
||||
properties:
|
||||
response:
|
||||
$ref: '#/components/schemas/OpenAIResponseObject'
|
||||
type:
|
||||
type: string
|
||||
const: response.completed
|
||||
default: response.completed
|
||||
additionalProperties: false
|
||||
required:
|
||||
- response
|
||||
- type
|
||||
title: >-
|
||||
OpenAIResponseObjectStreamResponseCompleted
|
||||
"OpenAIResponseObjectStreamResponseCreated":
|
||||
type: object
|
||||
properties:
|
||||
response:
|
||||
$ref: '#/components/schemas/OpenAIResponseObject'
|
||||
type:
|
||||
type: string
|
||||
const: response.created
|
||||
default: response.created
|
||||
additionalProperties: false
|
||||
required:
|
||||
- response
|
||||
- type
|
||||
title: >-
|
||||
OpenAIResponseObjectStreamResponseCreated
|
||||
CreateUploadSessionRequest:
|
||||
type: object
|
||||
properties:
|
||||
|
|
@ -6097,7 +6460,6 @@ components:
|
|||
additionalProperties: false
|
||||
required:
|
||||
- role
|
||||
- content
|
||||
title: OpenAIAssistantMessageParam
|
||||
description: >-
|
||||
A message containing the model's (assistant) response in an OpenAI-compatible
|
||||
|
|
|
|||
907
docs/getting_started_llama_api.ipynb
Normal file
907
docs/getting_started_llama_api.ipynb
Normal file
File diff suppressed because one or more lines are too long
|
|
@ -1,35 +1,35 @@
|
|||
@ECHO OFF
|
||||
|
||||
pushd %~dp0
|
||||
|
||||
REM Command file for Sphinx documentation
|
||||
|
||||
if "%SPHINXBUILD%" == "" (
|
||||
set SPHINXBUILD=sphinx-build
|
||||
)
|
||||
set SOURCEDIR=.
|
||||
set BUILDDIR=_build
|
||||
|
||||
%SPHINXBUILD% >NUL 2>NUL
|
||||
if errorlevel 9009 (
|
||||
echo.
|
||||
echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
|
||||
echo.installed, then set the SPHINXBUILD environment variable to point
|
||||
echo.to the full path of the 'sphinx-build' executable. Alternatively you
|
||||
echo.may add the Sphinx directory to PATH.
|
||||
echo.
|
||||
echo.If you don't have Sphinx installed, grab it from
|
||||
echo.https://www.sphinx-doc.org/
|
||||
exit /b 1
|
||||
)
|
||||
|
||||
if "%1" == "" goto help
|
||||
|
||||
%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
|
||||
goto end
|
||||
|
||||
:help
|
||||
%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
|
||||
|
||||
:end
|
||||
popd
|
||||
@ECHO OFF
|
||||
|
||||
pushd %~dp0
|
||||
|
||||
REM Command file for Sphinx documentation
|
||||
|
||||
if "%SPHINXBUILD%" == "" (
|
||||
set SPHINXBUILD=sphinx-build
|
||||
)
|
||||
set SOURCEDIR=.
|
||||
set BUILDDIR=_build
|
||||
|
||||
%SPHINXBUILD% >NUL 2>NUL
|
||||
if errorlevel 9009 (
|
||||
echo.
|
||||
echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
|
||||
echo.installed, then set the SPHINXBUILD environment variable to point
|
||||
echo.to the full path of the 'sphinx-build' executable. Alternatively you
|
||||
echo.may add the Sphinx directory to PATH.
|
||||
echo.
|
||||
echo.If you don't have Sphinx installed, grab it from
|
||||
echo.https://www.sphinx-doc.org/
|
||||
exit /b 1
|
||||
)
|
||||
|
||||
if "%1" == "" goto help
|
||||
|
||||
%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
|
||||
goto end
|
||||
|
||||
:help
|
||||
%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
|
||||
|
||||
:end
|
||||
popd
|
||||
|
|
|
|||
|
|
@ -6,6 +6,7 @@
|
|||
|
||||
import hashlib
|
||||
import ipaddress
|
||||
import types
|
||||
import typing
|
||||
from dataclasses import make_dataclass
|
||||
from typing import Any, Dict, Set, Union
|
||||
|
|
@ -179,7 +180,7 @@ class ContentBuilder:
|
|||
"Creates the content subtree for a request or response."
|
||||
|
||||
def is_iterator_type(t):
|
||||
return "StreamChunk" in str(t)
|
||||
return "StreamChunk" in str(t) or "OpenAIResponseObjectStream" in str(t)
|
||||
|
||||
def get_media_type(t):
|
||||
if is_generic_list(t):
|
||||
|
|
@ -189,7 +190,7 @@ class ContentBuilder:
|
|||
else:
|
||||
return "application/json"
|
||||
|
||||
if typing.get_origin(payload_type) is typing.Union:
|
||||
if typing.get_origin(payload_type) in (typing.Union, types.UnionType):
|
||||
media_types = []
|
||||
item_types = []
|
||||
for x in typing.get_args(payload_type):
|
||||
|
|
|
|||
|
|
@ -1,16 +1,16 @@
|
|||
sphinx==8.1.3
|
||||
myst-parser
|
||||
linkify
|
||||
myst-parser
|
||||
-e git+https://github.com/pytorch/pytorch_sphinx_theme.git#egg=pytorch_sphinx_theme
|
||||
sphinx-rtd-theme>=1.0.0
|
||||
sphinx_autobuild
|
||||
sphinx==8.1.3
|
||||
sphinx-copybutton
|
||||
sphinx-design
|
||||
sphinx-pdj-theme
|
||||
sphinx_rtd_dark_mode
|
||||
sphinx-rtd-theme>=1.0.0
|
||||
sphinx-tabs
|
||||
sphinx_autobuild
|
||||
sphinx_rtd_dark_mode
|
||||
sphinxcontrib-mermaid
|
||||
sphinxcontrib-openapi
|
||||
sphinxcontrib-redoc
|
||||
sphinxcontrib-mermaid
|
||||
sphinxcontrib-video
|
||||
tomli
|
||||
|
|
|
|||
|
|
@ -68,7 +68,8 @@ chunks_response = client.vector_io.query(
|
|||
### Using the RAG Tool
|
||||
|
||||
A better way to ingest documents is to use the RAG Tool. This tool allows you to ingest documents from URLs, files, etc.
|
||||
and automatically chunks them into smaller pieces.
|
||||
and automatically chunks them into smaller pieces. More examples for how to format a RAGDocument can be found in the
|
||||
[appendix](#more-ragdocument-examples).
|
||||
|
||||
```python
|
||||
from llama_stack_client import RAGDocument
|
||||
|
|
@ -178,3 +179,38 @@ for vector_db_id in client.vector_dbs.list():
|
|||
print(f"Unregistering vector database: {vector_db_id.identifier}")
|
||||
client.vector_dbs.unregister(vector_db_id=vector_db_id.identifier)
|
||||
```
|
||||
|
||||
### Appendix
|
||||
|
||||
#### More RAGDocument Examples
|
||||
```python
|
||||
from llama_stack_client import RAGDocument
|
||||
import base64
|
||||
|
||||
RAGDocument(document_id="num-0", content={"uri": "file://path/to/file"})
|
||||
RAGDocument(document_id="num-1", content="plain text")
|
||||
RAGDocument(
|
||||
document_id="num-2",
|
||||
content={
|
||||
"type": "text",
|
||||
"text": "plain text input",
|
||||
}, # for inputs that should be treated as text explicitly
|
||||
)
|
||||
RAGDocument(
|
||||
document_id="num-3",
|
||||
content={
|
||||
"type": "image",
|
||||
"image": {"url": {"uri": "https://mywebsite.com/image.jpg"}},
|
||||
},
|
||||
)
|
||||
B64_ENCODED_IMAGE = base64.b64encode(
|
||||
requests.get(
|
||||
"https://raw.githubusercontent.com/meta-llama/llama-stack/refs/heads/main/docs/_static/llama-stack.png"
|
||||
).content
|
||||
)
|
||||
RAGDocuemnt(
|
||||
document_id="num-4",
|
||||
content={"type": "image", "image": {"data": B64_ENCODED_IMAGE}},
|
||||
)
|
||||
```
|
||||
for more strongly typed interaction use the typed dicts found [here](https://github.com/meta-llama/llama-stack-client-python/blob/38cd91c9e396f2be0bec1ee96a19771582ba6f17/src/llama_stack_client/types/shared_params/document.py).
|
||||
|
|
|
|||
|
|
@ -41,30 +41,9 @@ client.toolgroups.register(
|
|||
|
||||
The tool requires an API key which can be provided either in the configuration or through the request header `X-LlamaStack-Provider-Data`. The format of the header is `{"<provider_name>_api_key": <your api key>}`.
|
||||
|
||||
> **NOTE:** When using Tavily Search and Bing Search, the inference output will still display "Brave Search." This is because Llama models have been trained with Brave Search as a built-in tool. Tavily and bing is just being used in lieu of Brave search.
|
||||
|
||||
|
||||
#### Code Interpreter
|
||||
|
||||
The Code Interpreter allows execution of Python code within a controlled environment.
|
||||
|
||||
```python
|
||||
# Register Code Interpreter tool group
|
||||
client.toolgroups.register(
|
||||
toolgroup_id="builtin::code_interpreter", provider_id="code_interpreter"
|
||||
)
|
||||
```
|
||||
|
||||
Features:
|
||||
- Secure execution environment using `bwrap` sandboxing
|
||||
- Matplotlib support for generating plots
|
||||
- Disabled dangerous system operations
|
||||
- Configurable execution timeouts
|
||||
|
||||
> ⚠️ Important: The code interpreter tool can operate in a controlled environment locally or on Podman containers. To ensure proper functionality in containerized environments:
|
||||
> - The container requires privileged access (e.g., --privileged).
|
||||
> - Users without sufficient permissions may encounter permission errors. (`bwrap: Can't mount devpts on /newroot/dev/pts: Permission denied`)
|
||||
> - 🔒 Security Warning: Privileged mode grants elevated access and bypasses security restrictions. Use only in local, isolated, or controlled environments.
|
||||
|
||||
#### WolframAlpha
|
||||
|
||||
The WolframAlpha tool provides access to computational knowledge through the WolframAlpha API.
|
||||
|
|
@ -102,7 +81,7 @@ Features:
|
|||
- Context retrieval with token limits
|
||||
|
||||
|
||||
> **Note:** By default, llama stack run.yaml defines toolgroups for web search, code interpreter and rag, that are provided by tavily-search, code-interpreter and rag providers.
|
||||
> **Note:** By default, llama stack run.yaml defines toolgroups for web search, wolfram alpha and rag, that are provided by tavily-search, wolfram-alpha and rag providers.
|
||||
|
||||
## Model Context Protocol (MCP) Tools
|
||||
|
||||
|
|
@ -214,3 +193,69 @@ response = agent.create_turn(
|
|||
session_id=session_id,
|
||||
)
|
||||
```
|
||||
## Simple Example 2: Using an Agent with the Web Search Tool
|
||||
1. Start by registering a Tavily API key at [Tavily](https://tavily.com/).
|
||||
2. [Optional] Provide the API key directly to the Llama Stack server
|
||||
```bash
|
||||
export TAVILY_SEARCH_API_KEY="your key"
|
||||
```
|
||||
```bash
|
||||
--env TAVILY_SEARCH_API_KEY=${TAVILY_SEARCH_API_KEY}
|
||||
```
|
||||
3. Run the following script.
|
||||
```python
|
||||
from llama_stack_client.lib.agents.agent import Agent
|
||||
from llama_stack_client.types.agent_create_params import AgentConfig
|
||||
from llama_stack_client.lib.agents.event_logger import EventLogger
|
||||
from llama_stack_client import LlamaStackClient
|
||||
|
||||
client = LlamaStackClient(
|
||||
base_url=f"http://localhost:8321",
|
||||
provider_data={
|
||||
"tavily_search_api_key": "your_TAVILY_SEARCH_API_KEY"
|
||||
}, # Set this from the client side. No need to provide it if it has already been configured on the Llama Stack server.
|
||||
)
|
||||
|
||||
agent = Agent(
|
||||
client,
|
||||
model="meta-llama/Llama-3.2-3B-Instruct",
|
||||
instructions=(
|
||||
"You are a web search assistant, must use websearch tool to look up the most current and precise information available. "
|
||||
),
|
||||
tools=["builtin::websearch"],
|
||||
)
|
||||
|
||||
session_id = agent.create_session("websearch-session")
|
||||
|
||||
response = agent.create_turn(
|
||||
messages=[
|
||||
{"role": "user", "content": "How did the USA perform in the last Olympics?"}
|
||||
],
|
||||
session_id=session_id,
|
||||
)
|
||||
for log in EventLogger().log(response):
|
||||
log.print()
|
||||
```
|
||||
|
||||
## Simple Example3: Using an Agent with the WolframAlpha Tool
|
||||
1. Start by registering for a WolframAlpha API key at [WolframAlpha Developer Portal](https://developer.wolframalpha.com/access).
|
||||
2. Provide the API key either when starting the Llama Stack server:
|
||||
```bash
|
||||
--env WOLFRAM_ALPHA_API_KEY=${WOLFRAM_ALPHA_API_KEY}
|
||||
```
|
||||
or from the client side:
|
||||
```python
|
||||
client = LlamaStackClient(
|
||||
base_url="http://localhost:8321",
|
||||
provider_data={"wolfram_alpha_api_key": wolfram_api_key},
|
||||
)
|
||||
```
|
||||
3. Configure the tools in the Agent by setting `tools=["builtin::wolfram_alpha"]`.
|
||||
4. Example user query:
|
||||
```python
|
||||
response = agent.create_turn(
|
||||
messages=[{"role": "user", "content": "Solve x^2 + 2x + 1 = 0 using WolframAlpha"}],
|
||||
session_id=session_id,
|
||||
)
|
||||
```
|
||||
```
|
||||
|
|
|
|||
|
|
@ -109,8 +109,6 @@ llama stack build --list-templates
|
|||
+------------------------------+-----------------------------------------------------------------------------+
|
||||
| nvidia | Use NVIDIA NIM for running LLM inference |
|
||||
+------------------------------+-----------------------------------------------------------------------------+
|
||||
| meta-reference-quantized-gpu | Use Meta Reference with fp8, int4 quantization for running LLM inference |
|
||||
+------------------------------+-----------------------------------------------------------------------------+
|
||||
| cerebras | Use Cerebras for running LLM inference |
|
||||
+------------------------------+-----------------------------------------------------------------------------+
|
||||
| ollama | Use (an external) Ollama server for running LLM inference |
|
||||
|
|
@ -176,7 +174,11 @@ distribution_spec:
|
|||
safety: inline::llama-guard
|
||||
agents: inline::meta-reference
|
||||
telemetry: inline::meta-reference
|
||||
image_name: ollama
|
||||
image_type: conda
|
||||
|
||||
# If some providers are external, you can specify the path to the implementation
|
||||
external_providers_dir: /etc/llama-stack/providers.d
|
||||
```
|
||||
|
||||
```
|
||||
|
|
@ -184,6 +186,57 @@ llama stack build --config llama_stack/templates/ollama/build.yaml
|
|||
```
|
||||
:::
|
||||
|
||||
:::{tab-item} Building with External Providers
|
||||
|
||||
Llama Stack supports external providers that live outside of the main codebase. This allows you to create and maintain your own providers independently or use community-provided providers.
|
||||
|
||||
To build a distribution with external providers, you need to:
|
||||
|
||||
1. Configure the `external_providers_dir` in your build configuration file:
|
||||
|
||||
```yaml
|
||||
# Example my-external-stack.yaml with external providers
|
||||
version: '2'
|
||||
distribution_spec:
|
||||
description: Custom distro for CI tests
|
||||
providers:
|
||||
inference:
|
||||
- remote::custom_ollama
|
||||
# Add more providers as needed
|
||||
image_type: container
|
||||
image_name: ci-test
|
||||
# Path to external provider implementations
|
||||
external_providers_dir: /etc/llama-stack/providers.d
|
||||
```
|
||||
|
||||
Here's an example for a custom Ollama provider:
|
||||
|
||||
```yaml
|
||||
adapter:
|
||||
adapter_type: custom_ollama
|
||||
pip_packages:
|
||||
- ollama
|
||||
- aiohttp
|
||||
- llama-stack-provider-ollama # This is the provider package
|
||||
config_class: llama_stack_ollama_provider.config.OllamaImplConfig
|
||||
module: llama_stack_ollama_provider
|
||||
api_dependencies: []
|
||||
optional_api_dependencies: []
|
||||
```
|
||||
|
||||
The `pip_packages` section lists the Python packages required by the provider, as well as the
|
||||
provider package itself. The package must be available on PyPI or can be provided from a local
|
||||
directory or a git repository (git must be installed on the build environment).
|
||||
|
||||
2. Build your distribution using the config file:
|
||||
|
||||
```
|
||||
llama stack build --config my-external-stack.yaml
|
||||
```
|
||||
|
||||
For more information on external providers, including directory structure, provider types, and implementation requirements, see the [External Providers documentation](../providers/external.md).
|
||||
:::
|
||||
|
||||
:::{tab-item} Building Container
|
||||
|
||||
```{admonition} Podman Alternative
|
||||
|
|
|
|||
|
|
@ -53,6 +53,13 @@ models:
|
|||
provider_id: ollama
|
||||
provider_model_id: null
|
||||
shields: []
|
||||
server:
|
||||
port: 8321
|
||||
auth:
|
||||
provider_type: "kubernetes"
|
||||
config:
|
||||
api_server_url: "https://kubernetes.default.svc"
|
||||
ca_cert_path: "/path/to/ca.crt"
|
||||
```
|
||||
|
||||
Let's break this down into the different sections. The first section specifies the set of APIs that the stack server will serve:
|
||||
|
|
@ -102,6 +109,105 @@ A Model is an instance of a "Resource" (see [Concepts](../concepts/index)) and i
|
|||
|
||||
What's with the `provider_model_id` field? This is an identifier for the model inside the provider's model catalog. Contrast it with `model_id` which is the identifier for the same model for Llama Stack's purposes. For example, you may want to name "llama3.2:vision-11b" as "image_captioning_model" when you use it in your Stack interactions. When omitted, the server will set `provider_model_id` to be the same as `model_id`.
|
||||
|
||||
## Server Configuration
|
||||
|
||||
The `server` section configures the HTTP server that serves the Llama Stack APIs:
|
||||
|
||||
```yaml
|
||||
server:
|
||||
port: 8321 # Port to listen on (default: 8321)
|
||||
tls_certfile: "/path/to/cert.pem" # Optional: Path to TLS certificate for HTTPS
|
||||
tls_keyfile: "/path/to/key.pem" # Optional: Path to TLS key for HTTPS
|
||||
auth: # Optional: Authentication configuration
|
||||
provider_type: "kubernetes" # Type of auth provider
|
||||
config: # Provider-specific configuration
|
||||
api_server_url: "https://kubernetes.default.svc"
|
||||
ca_cert_path: "/path/to/ca.crt" # Optional: Path to CA certificate
|
||||
```
|
||||
|
||||
### Authentication Configuration
|
||||
|
||||
The `auth` section configures authentication for the server. When configured, all API requests must include a valid Bearer token in the Authorization header:
|
||||
|
||||
```
|
||||
Authorization: Bearer <token>
|
||||
```
|
||||
|
||||
The server supports multiple authentication providers:
|
||||
|
||||
#### Kubernetes Provider
|
||||
|
||||
The Kubernetes cluster must be configured to use a service account for authentication.
|
||||
|
||||
```bash
|
||||
kubectl create namespace llama-stack
|
||||
kubectl create serviceaccount llama-stack-auth -n llama-stack
|
||||
kubectl create rolebinding llama-stack-auth-rolebinding --clusterrole=admin --serviceaccount=llama-stack:llama-stack-auth -n llama-stack
|
||||
kubectl create token llama-stack-auth -n llama-stack > llama-stack-auth-token
|
||||
```
|
||||
|
||||
Validates tokens against the Kubernetes API server:
|
||||
```yaml
|
||||
server:
|
||||
auth:
|
||||
provider_type: "kubernetes"
|
||||
config:
|
||||
api_server_url: "https://kubernetes.default.svc" # URL of the Kubernetes API server
|
||||
ca_cert_path: "/path/to/ca.crt" # Optional: Path to CA certificate
|
||||
```
|
||||
|
||||
The provider extracts user information from the JWT token:
|
||||
- Username from the `sub` claim becomes a role
|
||||
- Kubernetes groups become teams
|
||||
|
||||
You can easily validate a request by running:
|
||||
|
||||
```bash
|
||||
curl -s -L -H "Authorization: Bearer $(cat llama-stack-auth-token)" http://127.0.0.1:8321/v1/providers
|
||||
```
|
||||
|
||||
#### Custom Provider
|
||||
Validates tokens against a custom authentication endpoint:
|
||||
```yaml
|
||||
server:
|
||||
auth:
|
||||
provider_type: "custom"
|
||||
config:
|
||||
endpoint: "https://auth.example.com/validate" # URL of the auth endpoint
|
||||
```
|
||||
|
||||
The custom endpoint receives a POST request with:
|
||||
```json
|
||||
{
|
||||
"api_key": "<token>",
|
||||
"request": {
|
||||
"path": "/api/v1/endpoint",
|
||||
"headers": {
|
||||
"content-type": "application/json",
|
||||
"user-agent": "curl/7.64.1"
|
||||
},
|
||||
"params": {
|
||||
"key": ["value"]
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
And must respond with:
|
||||
```json
|
||||
{
|
||||
"access_attributes": {
|
||||
"roles": ["admin", "user"],
|
||||
"teams": ["ml-team", "nlp-team"],
|
||||
"projects": ["llama-3", "project-x"],
|
||||
"namespaces": ["research"]
|
||||
},
|
||||
"message": "Authentication successful"
|
||||
}
|
||||
```
|
||||
|
||||
If no access attributes are returned, the token is used as a namespace.
|
||||
|
||||
## Extending to handle Safety
|
||||
|
||||
Configuring Safety can be a little involved so it is instructive to go through an example.
|
||||
|
|
|
|||
|
|
@ -24,7 +24,7 @@ The key files in the app are `ExampleLlamaStackLocalInference.kt`, `ExampleLlama
|
|||
Add the following dependency in your `build.gradle.kts` file:
|
||||
```
|
||||
dependencies {
|
||||
implementation("com.llama.llamastack:llama-stack-client-kotlin:0.1.4.2")
|
||||
implementation("com.llama.llamastack:llama-stack-client-kotlin:0.2.2")
|
||||
}
|
||||
```
|
||||
This will download jar files in your gradle cache in a directory like `~/.gradle/caches/modules-2/files-2.1/com.llama.llamastack/`
|
||||
|
|
@ -37,11 +37,7 @@ For local inferencing, it is required to include the ExecuTorch library into you
|
|||
|
||||
Include the ExecuTorch library by:
|
||||
1. Download the `download-prebuilt-et-lib.sh` script file from the [llama-stack-client-kotlin-client-local](https://github.com/meta-llama/llama-stack-client-kotlin/tree/latest-release/llama-stack-client-kotlin-client-local/download-prebuilt-et-lib.sh) directory to your local machine.
|
||||
2. Move the script to the top level of your Android app where the app directory resides:
|
||||
<p align="center">
|
||||
<img src="https://github.com/meta-llama/llama-stack-client-kotlin/blob/latest-release/doc/img/example_android_app_directory.png" style="width:300px">
|
||||
</p>
|
||||
|
||||
2. Move the script to the top level of your Android app where the `app` directory resides.
|
||||
3. Run `sh download-prebuilt-et-lib.sh` to create an `app/libs` directory and download the `executorch.aar` in that path. This generates an ExecuTorch library for the XNNPACK delegate.
|
||||
4. Add the `executorch.aar` dependency in your `build.gradle.kts` file:
|
||||
```
|
||||
|
|
@ -52,6 +48,8 @@ dependencies {
|
|||
}
|
||||
```
|
||||
|
||||
See other dependencies for the local RAG in Android app [README](https://github.com/meta-llama/llama-stack-client-kotlin/tree/latest-release/examples/android_app#quick-start).
|
||||
|
||||
## Llama Stack APIs in Your Android App
|
||||
Breaking down the demo app, this section will show the core pieces that are used to initialize and run inference with Llama Stack using the Kotlin library.
|
||||
|
||||
|
|
@ -60,7 +58,7 @@ Start a Llama Stack server on localhost. Here is an example of how you can do th
|
|||
```
|
||||
conda create -n stack-fireworks python=3.10
|
||||
conda activate stack-fireworks
|
||||
pip install --no-cache llama-stack==0.1.4
|
||||
pip install --no-cache llama-stack==0.2.2
|
||||
llama stack build --template fireworks --image-type conda
|
||||
export FIREWORKS_API_KEY=<SOME_KEY>
|
||||
llama stack run fireworks --port 5050
|
||||
|
|
|
|||
|
|
@ -1,88 +0,0 @@
|
|||
<!-- This file was auto-generated by distro_codegen.py, please edit source -->
|
||||
# NVIDIA Distribution
|
||||
|
||||
The `llamastack/distribution-nvidia` distribution consists of the following provider configurations.
|
||||
|
||||
| API | Provider(s) |
|
||||
|-----|-------------|
|
||||
| agents | `inline::meta-reference` |
|
||||
| datasetio | `inline::localfs` |
|
||||
| eval | `inline::meta-reference` |
|
||||
| inference | `remote::nvidia` |
|
||||
| post_training | `remote::nvidia` |
|
||||
| safety | `remote::nvidia` |
|
||||
| scoring | `inline::basic` |
|
||||
| telemetry | `inline::meta-reference` |
|
||||
| tool_runtime | `inline::rag-runtime` |
|
||||
| vector_io | `inline::faiss` |
|
||||
|
||||
|
||||
### Environment Variables
|
||||
|
||||
The following environment variables can be configured:
|
||||
|
||||
- `NVIDIA_API_KEY`: NVIDIA API Key (default: ``)
|
||||
- `NVIDIA_USER_ID`: NVIDIA User ID (default: `llama-stack-user`)
|
||||
- `NVIDIA_DATASET_NAMESPACE`: NVIDIA Dataset Namespace (default: `default`)
|
||||
- `NVIDIA_ACCESS_POLICIES`: NVIDIA Access Policies (default: `{}`)
|
||||
- `NVIDIA_PROJECT_ID`: NVIDIA Project ID (default: `test-project`)
|
||||
- `NVIDIA_CUSTOMIZER_URL`: NVIDIA Customizer URL (default: `https://customizer.api.nvidia.com`)
|
||||
- `NVIDIA_OUTPUT_MODEL_DIR`: NVIDIA Output Model Directory (default: `test-example-model@v1`)
|
||||
- `GUARDRAILS_SERVICE_URL`: URL for the NeMo Guardrails Service (default: `http://0.0.0.0:7331`)
|
||||
- `INFERENCE_MODEL`: Inference model (default: `Llama3.1-8B-Instruct`)
|
||||
- `SAFETY_MODEL`: Name of the model to use for safety (default: `meta/llama-3.1-8b-instruct`)
|
||||
|
||||
### Models
|
||||
|
||||
The following models are available by default:
|
||||
|
||||
- `meta/llama3-8b-instruct (aliases: meta-llama/Llama-3-8B-Instruct)`
|
||||
- `meta/llama3-70b-instruct (aliases: meta-llama/Llama-3-70B-Instruct)`
|
||||
- `meta/llama-3.1-8b-instruct (aliases: meta-llama/Llama-3.1-8B-Instruct)`
|
||||
- `meta/llama-3.1-70b-instruct (aliases: meta-llama/Llama-3.1-70B-Instruct)`
|
||||
- `meta/llama-3.1-405b-instruct (aliases: meta-llama/Llama-3.1-405B-Instruct-FP8)`
|
||||
- `meta/llama-3.2-1b-instruct (aliases: meta-llama/Llama-3.2-1B-Instruct)`
|
||||
- `meta/llama-3.2-3b-instruct (aliases: meta-llama/Llama-3.2-3B-Instruct)`
|
||||
- `meta/llama-3.2-11b-vision-instruct (aliases: meta-llama/Llama-3.2-11B-Vision-Instruct)`
|
||||
- `meta/llama-3.2-90b-vision-instruct (aliases: meta-llama/Llama-3.2-90B-Vision-Instruct)`
|
||||
- `nvidia/llama-3.2-nv-embedqa-1b-v2 `
|
||||
- `nvidia/nv-embedqa-e5-v5 `
|
||||
- `nvidia/nv-embedqa-mistral-7b-v2 `
|
||||
- `snowflake/arctic-embed-l `
|
||||
|
||||
|
||||
### Prerequisite: API Keys
|
||||
|
||||
Make sure you have access to a NVIDIA API Key. You can get one by visiting [https://build.nvidia.com/](https://build.nvidia.com/).
|
||||
|
||||
|
||||
## Running Llama Stack with NVIDIA
|
||||
|
||||
You can do this via Conda (build code) or Docker which has a pre-built image.
|
||||
|
||||
### Via Docker
|
||||
|
||||
This method allows you to get started quickly without having to build the distribution code.
|
||||
|
||||
```bash
|
||||
LLAMA_STACK_PORT=8321
|
||||
docker run \
|
||||
-it \
|
||||
--pull always \
|
||||
-p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
|
||||
-v ./run.yaml:/root/my-run.yaml \
|
||||
llamastack/distribution-nvidia \
|
||||
--yaml-config /root/my-run.yaml \
|
||||
--port $LLAMA_STACK_PORT \
|
||||
--env NVIDIA_API_KEY=$NVIDIA_API_KEY
|
||||
```
|
||||
|
||||
### Via Conda
|
||||
|
||||
```bash
|
||||
llama stack build --template nvidia --image-type conda
|
||||
llama stack run ./run.yaml \
|
||||
--port 8321 \
|
||||
--env NVIDIA_API_KEY=$NVIDIA_API_KEY
|
||||
--env INFERENCE_MODEL=$INFERENCE_MODEL
|
||||
```
|
||||
88
docs/source/distributions/remote_hosted_distro/watsonx.md
Normal file
88
docs/source/distributions/remote_hosted_distro/watsonx.md
Normal file
|
|
@ -0,0 +1,88 @@
|
|||
---
|
||||
orphan: true
|
||||
---
|
||||
<!-- This file was auto-generated by distro_codegen.py, please edit source -->
|
||||
# watsonx Distribution
|
||||
|
||||
```{toctree}
|
||||
:maxdepth: 2
|
||||
:hidden:
|
||||
|
||||
self
|
||||
```
|
||||
|
||||
The `llamastack/distribution-watsonx` distribution consists of the following provider configurations.
|
||||
|
||||
| API | Provider(s) |
|
||||
|-----|-------------|
|
||||
| agents | `inline::meta-reference` |
|
||||
| datasetio | `remote::huggingface`, `inline::localfs` |
|
||||
| eval | `inline::meta-reference` |
|
||||
| inference | `remote::watsonx` |
|
||||
| safety | `inline::llama-guard` |
|
||||
| scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` |
|
||||
| telemetry | `inline::meta-reference` |
|
||||
| tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::rag-runtime`, `remote::model-context-protocol` |
|
||||
| vector_io | `inline::faiss` |
|
||||
|
||||
|
||||
|
||||
### Environment Variables
|
||||
|
||||
The following environment variables can be configured:
|
||||
|
||||
- `LLAMASTACK_PORT`: Port for the Llama Stack distribution server (default: `5001`)
|
||||
- `WATSONX_API_KEY`: watsonx API Key (default: ``)
|
||||
- `WATSONX_PROJECT_ID`: watsonx Project ID (default: ``)
|
||||
|
||||
### Models
|
||||
|
||||
The following models are available by default:
|
||||
|
||||
- `meta-llama/llama-3-3-70b-instruct (aliases: meta-llama/Llama-3.3-70B-Instruct)`
|
||||
- `meta-llama/llama-2-13b-chat (aliases: meta-llama/Llama-2-13b)`
|
||||
- `meta-llama/llama-3-1-70b-instruct (aliases: meta-llama/Llama-3.1-70B-Instruct)`
|
||||
- `meta-llama/llama-3-1-8b-instruct (aliases: meta-llama/Llama-3.1-8B-Instruct)`
|
||||
- `meta-llama/llama-3-2-11b-vision-instruct (aliases: meta-llama/Llama-3.2-11B-Vision-Instruct)`
|
||||
- `meta-llama/llama-3-2-1b-instruct (aliases: meta-llama/Llama-3.2-1B-Instruct)`
|
||||
- `meta-llama/llama-3-2-3b-instruct (aliases: meta-llama/Llama-3.2-3B-Instruct)`
|
||||
- `meta-llama/llama-3-2-90b-vision-instruct (aliases: meta-llama/Llama-3.2-90B-Vision-Instruct)`
|
||||
- `meta-llama/llama-guard-3-11b-vision (aliases: meta-llama/Llama-Guard-3-11B-Vision)`
|
||||
|
||||
|
||||
### Prerequisite: API Keys
|
||||
|
||||
Make sure you have access to a watsonx API Key. You can get one by referring [watsonx.ai](https://www.ibm.com/docs/en/masv-and-l/maximo-manage/continuous-delivery?topic=setup-create-watsonx-api-key).
|
||||
|
||||
|
||||
## Running Llama Stack with watsonx
|
||||
|
||||
You can do this via Conda (build code), venv or Docker which has a pre-built image.
|
||||
|
||||
### Via Docker
|
||||
|
||||
This method allows you to get started quickly without having to build the distribution code.
|
||||
|
||||
```bash
|
||||
LLAMA_STACK_PORT=5001
|
||||
docker run \
|
||||
-it \
|
||||
-p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
|
||||
-v ./run.yaml:/root/my-run.yaml \
|
||||
llamastack/distribution-watsonx \
|
||||
--yaml-config /root/my-run.yaml \
|
||||
--port $LLAMA_STACK_PORT \
|
||||
--env WATSONX_API_KEY=$WATSONX_API_KEY \
|
||||
--env WATSONX_PROJECT_ID=$WATSONX_PROJECT_ID \
|
||||
--env WATSONX_BASE_URL=$WATSONX_BASE_URL
|
||||
```
|
||||
|
||||
### Via Conda
|
||||
|
||||
```bash
|
||||
llama stack build --template watsonx --image-type conda
|
||||
llama stack run ./run.yaml \
|
||||
--port $LLAMA_STACK_PORT \
|
||||
--env WATSONX_API_KEY=$WATSONX_API_KEY \
|
||||
--env WATSONX_PROJECT_ID=$WATSONX_PROJECT_ID
|
||||
```
|
||||
|
|
@ -19,7 +19,7 @@ The `llamastack/distribution-bedrock` distribution consists of the following pro
|
|||
| safety | `remote::bedrock` |
|
||||
| scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` |
|
||||
| telemetry | `inline::meta-reference` |
|
||||
| tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::code-interpreter`, `inline::rag-runtime`, `remote::model-context-protocol` |
|
||||
| tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::rag-runtime`, `remote::model-context-protocol` |
|
||||
| vector_io | `inline::faiss`, `remote::chromadb`, `remote::pgvector` |
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -12,7 +12,7 @@ The `llamastack/distribution-cerebras` distribution consists of the following pr
|
|||
| safety | `inline::llama-guard` |
|
||||
| scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` |
|
||||
| telemetry | `inline::meta-reference` |
|
||||
| tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::code-interpreter`, `inline::rag-runtime` |
|
||||
| tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::rag-runtime` |
|
||||
| vector_io | `inline::faiss`, `remote::chromadb`, `remote::pgvector` |
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -22,7 +22,7 @@ The `llamastack/distribution-fireworks` distribution consists of the following p
|
|||
| safety | `inline::llama-guard` |
|
||||
| scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` |
|
||||
| telemetry | `inline::meta-reference` |
|
||||
| tool_runtime | `remote::brave-search`, `remote::tavily-search`, `remote::wolfram-alpha`, `inline::code-interpreter`, `inline::rag-runtime`, `remote::model-context-protocol` |
|
||||
| tool_runtime | `remote::brave-search`, `remote::tavily-search`, `remote::wolfram-alpha`, `inline::rag-runtime`, `remote::model-context-protocol` |
|
||||
| vector_io | `inline::faiss`, `remote::chromadb`, `remote::pgvector` |
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -22,7 +22,7 @@ The `llamastack/distribution-groq` distribution consists of the following provid
|
|||
| safety | `inline::llama-guard` |
|
||||
| scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` |
|
||||
| telemetry | `inline::meta-reference` |
|
||||
| tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::code-interpreter`, `inline::rag-runtime` |
|
||||
| tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::rag-runtime` |
|
||||
| vector_io | `inline::faiss` |
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -22,7 +22,7 @@ The `llamastack/distribution-meta-reference-gpu` distribution consists of the fo
|
|||
| safety | `inline::llama-guard` |
|
||||
| scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` |
|
||||
| telemetry | `inline::meta-reference` |
|
||||
| tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::code-interpreter`, `inline::rag-runtime`, `remote::model-context-protocol` |
|
||||
| tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::rag-runtime`, `remote::model-context-protocol` |
|
||||
| vector_io | `inline::faiss`, `remote::chromadb`, `remote::pgvector` |
|
||||
|
||||
|
||||
|
|
@ -81,6 +81,7 @@ LLAMA_STACK_PORT=8321
|
|||
docker run \
|
||||
-it \
|
||||
--pull always \
|
||||
--gpu all \
|
||||
-p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
|
||||
-v ~/.llama:/root/.llama \
|
||||
llamastack/distribution-meta-reference-gpu \
|
||||
|
|
@ -94,6 +95,7 @@ If you are using Llama Stack Safety / Shield APIs, use:
|
|||
docker run \
|
||||
-it \
|
||||
--pull always \
|
||||
--gpu all \
|
||||
-p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
|
||||
-v ~/.llama:/root/.llama \
|
||||
llamastack/distribution-meta-reference-gpu \
|
||||
|
|
|
|||
|
|
@ -1,123 +0,0 @@
|
|||
---
|
||||
orphan: true
|
||||
---
|
||||
<!-- This file was auto-generated by distro_codegen.py, please edit source -->
|
||||
# Meta Reference Quantized Distribution
|
||||
|
||||
```{toctree}
|
||||
:maxdepth: 2
|
||||
:hidden:
|
||||
|
||||
self
|
||||
```
|
||||
|
||||
The `llamastack/distribution-meta-reference-quantized-gpu` distribution consists of the following provider configurations:
|
||||
|
||||
| API | Provider(s) |
|
||||
|-----|-------------|
|
||||
| agents | `inline::meta-reference` |
|
||||
| datasetio | `remote::huggingface`, `inline::localfs` |
|
||||
| eval | `inline::meta-reference` |
|
||||
| inference | `inline::meta-reference-quantized` |
|
||||
| safety | `inline::llama-guard` |
|
||||
| scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` |
|
||||
| telemetry | `inline::meta-reference` |
|
||||
| tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::code-interpreter`, `inline::rag-runtime`, `remote::model-context-protocol` |
|
||||
| vector_io | `inline::faiss`, `remote::chromadb`, `remote::pgvector` |
|
||||
|
||||
|
||||
The only difference vs. the `meta-reference-gpu` distribution is that it has support for more efficient inference -- with fp8, int4 quantization, etc.
|
||||
|
||||
Note that you need access to nvidia GPUs to run this distribution. This distribution is not compatible with CPU-only machines or machines with AMD GPUs.
|
||||
|
||||
### Environment Variables
|
||||
|
||||
The following environment variables can be configured:
|
||||
|
||||
- `LLAMA_STACK_PORT`: Port for the Llama Stack distribution server (default: `8321`)
|
||||
- `INFERENCE_MODEL`: Inference model loaded into the Meta Reference server (default: `meta-llama/Llama-3.2-3B-Instruct`)
|
||||
- `INFERENCE_CHECKPOINT_DIR`: Directory containing the Meta Reference model checkpoint (default: `null`)
|
||||
|
||||
|
||||
## Prerequisite: Downloading Models
|
||||
|
||||
Please use `llama model list --downloaded` to check that you have llama model checkpoints downloaded in `~/.llama` before proceeding. See [installation guide](https://llama-stack.readthedocs.io/en/latest/references/llama_cli_reference/download_models.html) here to download the models. Run `llama model list` to see the available models to download, and `llama model download` to download the checkpoints.
|
||||
|
||||
```
|
||||
$ llama model list --downloaded
|
||||
┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┓
|
||||
┃ Model ┃ Size ┃ Modified Time ┃
|
||||
┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━┩
|
||||
│ Llama3.2-1B-Instruct:int4-qlora-eo8 │ 1.53 GB │ 2025-02-26 11:22:28 │
|
||||
├─────────────────────────────────────────┼──────────┼─────────────────────┤
|
||||
│ Llama3.2-1B │ 2.31 GB │ 2025-02-18 21:48:52 │
|
||||
├─────────────────────────────────────────┼──────────┼─────────────────────┤
|
||||
│ Prompt-Guard-86M │ 0.02 GB │ 2025-02-26 11:29:28 │
|
||||
├─────────────────────────────────────────┼──────────┼─────────────────────┤
|
||||
│ Llama3.2-3B-Instruct:int4-spinquant-eo8 │ 3.69 GB │ 2025-02-26 11:37:41 │
|
||||
├─────────────────────────────────────────┼──────────┼─────────────────────┤
|
||||
│ Llama3.2-3B │ 5.99 GB │ 2025-02-18 21:51:26 │
|
||||
├─────────────────────────────────────────┼──────────┼─────────────────────┤
|
||||
│ Llama3.1-8B │ 14.97 GB │ 2025-02-16 10:36:37 │
|
||||
├─────────────────────────────────────────┼──────────┼─────────────────────┤
|
||||
│ Llama3.2-1B-Instruct:int4-spinquant-eo8 │ 1.51 GB │ 2025-02-26 11:35:02 │
|
||||
├─────────────────────────────────────────┼──────────┼─────────────────────┤
|
||||
│ Llama-Guard-3-1B │ 2.80 GB │ 2025-02-26 11:20:46 │
|
||||
├─────────────────────────────────────────┼──────────┼─────────────────────┤
|
||||
│ Llama-Guard-3-1B:int4 │ 0.43 GB │ 2025-02-26 11:33:33 │
|
||||
└─────────────────────────────────────────┴──────────┴─────────────────────┘
|
||||
```
|
||||
|
||||
## Running the Distribution
|
||||
|
||||
You can do this via Conda (build code) or Docker which has a pre-built image.
|
||||
|
||||
### Via Docker
|
||||
|
||||
This method allows you to get started quickly without having to build the distribution code.
|
||||
|
||||
```bash
|
||||
LLAMA_STACK_PORT=8321
|
||||
docker run \
|
||||
-it \
|
||||
--pull always \
|
||||
-p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
|
||||
-v ~/.llama:/root/.llama \
|
||||
llamastack/distribution-meta-reference-quantized-gpu \
|
||||
--port $LLAMA_STACK_PORT \
|
||||
--env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct
|
||||
```
|
||||
|
||||
If you are using Llama Stack Safety / Shield APIs, use:
|
||||
|
||||
```bash
|
||||
docker run \
|
||||
-it \
|
||||
--pull always \
|
||||
-p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
|
||||
-v ~/.llama:/root/.llama \
|
||||
llamastack/distribution-meta-reference-quantized-gpu \
|
||||
--port $LLAMA_STACK_PORT \
|
||||
--env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct \
|
||||
--env SAFETY_MODEL=meta-llama/Llama-Guard-3-1B
|
||||
```
|
||||
|
||||
### Via Conda
|
||||
|
||||
Make sure you have done `uv pip install llama-stack` and have the Llama Stack CLI available.
|
||||
|
||||
```bash
|
||||
llama stack build --template meta-reference-quantized-gpu --image-type conda
|
||||
llama stack run distributions/meta-reference-quantized-gpu/run.yaml \
|
||||
--port $LLAMA_STACK_PORT \
|
||||
--env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct
|
||||
```
|
||||
|
||||
If you are using Llama Stack Safety / Shield APIs, use:
|
||||
|
||||
```bash
|
||||
llama stack run distributions/meta-reference-quantized-gpu/run-with-safety.yaml \
|
||||
--port $LLAMA_STACK_PORT \
|
||||
--env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct \
|
||||
--env SAFETY_MODEL=meta-llama/Llama-Guard-3-1B
|
||||
```
|
||||
|
|
@ -6,8 +6,8 @@ The `llamastack/distribution-nvidia` distribution consists of the following prov
|
|||
| API | Provider(s) |
|
||||
|-----|-------------|
|
||||
| agents | `inline::meta-reference` |
|
||||
| datasetio | `inline::localfs` |
|
||||
| eval | `inline::meta-reference` |
|
||||
| datasetio | `inline::localfs`, `remote::nvidia` |
|
||||
| eval | `remote::nvidia` |
|
||||
| inference | `remote::nvidia` |
|
||||
| post_training | `remote::nvidia` |
|
||||
| safety | `remote::nvidia` |
|
||||
|
|
@ -22,13 +22,13 @@ The `llamastack/distribution-nvidia` distribution consists of the following prov
|
|||
The following environment variables can be configured:
|
||||
|
||||
- `NVIDIA_API_KEY`: NVIDIA API Key (default: ``)
|
||||
- `NVIDIA_USER_ID`: NVIDIA User ID (default: `llama-stack-user`)
|
||||
- `NVIDIA_APPEND_API_VERSION`: Whether to append the API version to the base_url (default: `True`)
|
||||
- `NVIDIA_DATASET_NAMESPACE`: NVIDIA Dataset Namespace (default: `default`)
|
||||
- `NVIDIA_ACCESS_POLICIES`: NVIDIA Access Policies (default: `{}`)
|
||||
- `NVIDIA_PROJECT_ID`: NVIDIA Project ID (default: `test-project`)
|
||||
- `NVIDIA_CUSTOMIZER_URL`: NVIDIA Customizer URL (default: `https://customizer.api.nvidia.com`)
|
||||
- `NVIDIA_OUTPUT_MODEL_DIR`: NVIDIA Output Model Directory (default: `test-example-model@v1`)
|
||||
- `GUARDRAILS_SERVICE_URL`: URL for the NeMo Guardrails Service (default: `http://0.0.0.0:7331`)
|
||||
- `NVIDIA_EVALUATOR_URL`: URL for the NeMo Evaluator Service (default: `http://0.0.0.0:7331`)
|
||||
- `INFERENCE_MODEL`: Inference model (default: `Llama3.1-8B-Instruct`)
|
||||
- `SAFETY_MODEL`: Name of the model to use for safety (default: `meta/llama-3.1-8b-instruct`)
|
||||
|
||||
|
|
@ -45,20 +45,91 @@ The following models are available by default:
|
|||
- `meta/llama-3.2-3b-instruct (aliases: meta-llama/Llama-3.2-3B-Instruct)`
|
||||
- `meta/llama-3.2-11b-vision-instruct (aliases: meta-llama/Llama-3.2-11B-Vision-Instruct)`
|
||||
- `meta/llama-3.2-90b-vision-instruct (aliases: meta-llama/Llama-3.2-90B-Vision-Instruct)`
|
||||
- `meta/llama-3.3-70b-instruct (aliases: meta-llama/Llama-3.3-70B-Instruct)`
|
||||
- `nvidia/llama-3.2-nv-embedqa-1b-v2 `
|
||||
- `nvidia/nv-embedqa-e5-v5 `
|
||||
- `nvidia/nv-embedqa-mistral-7b-v2 `
|
||||
- `snowflake/arctic-embed-l `
|
||||
|
||||
|
||||
### Prerequisite: API Keys
|
||||
## Prerequisites
|
||||
### NVIDIA API Keys
|
||||
|
||||
Make sure you have access to a NVIDIA API Key. You can get one by visiting [https://build.nvidia.com/](https://build.nvidia.com/).
|
||||
Make sure you have access to a NVIDIA API Key. You can get one by visiting [https://build.nvidia.com/](https://build.nvidia.com/). Use this key for the `NVIDIA_API_KEY` environment variable.
|
||||
|
||||
### Deploy NeMo Microservices Platform
|
||||
The NVIDIA NeMo microservices platform supports end-to-end microservice deployment of a complete AI flywheel on your Kubernetes cluster through the NeMo Microservices Helm Chart. Please reference the [NVIDIA NeMo Microservices documentation](https://docs.nvidia.com/nemo/microservices/latest/about/index.html) for platform prerequisites and instructions to install and deploy the platform.
|
||||
|
||||
## Supported Services
|
||||
Each Llama Stack API corresponds to a specific NeMo microservice. The core microservices (Customizer, Evaluator, Guardrails) are exposed by the same endpoint. The platform components (Data Store) are each exposed by separate endpoints.
|
||||
|
||||
### Inference: NVIDIA NIM
|
||||
NVIDIA NIM is used for running inference with registered models. There are two ways to access NVIDIA NIMs:
|
||||
1. Hosted (default): Preview APIs hosted at https://integrate.api.nvidia.com (Requires an API key)
|
||||
2. Self-hosted: NVIDIA NIMs that run on your own infrastructure.
|
||||
|
||||
The deployed platform includes the NIM Proxy microservice, which is the service that provides to access your NIMs (for example, to run inference on a model). Set the `NVIDIA_BASE_URL` environment variable to use your NVIDIA NIM Proxy deployment.
|
||||
|
||||
### Datasetio API: NeMo Data Store
|
||||
The NeMo Data Store microservice serves as the default file storage solution for the NeMo microservices platform. It exposts APIs compatible with the Hugging Face Hub client (`HfApi`), so you can use the client to interact with Data Store. The `NVIDIA_DATASETS_URL` environment variable should point to your NeMo Data Store endpoint.
|
||||
|
||||
See the [NVIDIA Datasetio docs](/llama_stack/providers/remote/datasetio/nvidia/README.md) for supported features and example usage.
|
||||
|
||||
### Eval API: NeMo Evaluator
|
||||
The NeMo Evaluator microservice supports evaluation of LLMs. Launching an Evaluation job with NeMo Evaluator requires an Evaluation Config (an object that contains metadata needed by the job). A Llama Stack Benchmark maps to an Evaluation Config, so registering a Benchmark creates an Evaluation Config in NeMo Evaluator. The `NVIDIA_EVALUATOR_URL` environment variable should point to your NeMo Microservices endpoint.
|
||||
|
||||
See the [NVIDIA Eval docs](/llama_stack/providers/remote/eval/nvidia/README.md) for supported features and example usage.
|
||||
|
||||
### Post-Training API: NeMo Customizer
|
||||
The NeMo Customizer microservice supports fine-tuning models. You can reference [this list of supported models](/llama_stack/providers/remote/post_training/nvidia/models.py) that can be fine-tuned using Llama Stack. The `NVIDIA_CUSTOMIZER_URL` environment variable should point to your NeMo Microservices endpoint.
|
||||
|
||||
See the [NVIDIA Post-Training docs](/llama_stack/providers/remote/post_training/nvidia/README.md) for supported features and example usage.
|
||||
|
||||
### Safety API: NeMo Guardrails
|
||||
The NeMo Guardrails microservice sits between your application and the LLM, and adds checks and content moderation to a model. The `GUARDRAILS_SERVICE_URL` environment variable should point to your NeMo Microservices endpoint.
|
||||
|
||||
See the NVIDIA Safety docs for supported features and example usage.
|
||||
|
||||
## Deploying models
|
||||
In order to use a registered model with the Llama Stack APIs, ensure the corresponding NIM is deployed to your environment. For example, you can use the NIM Proxy microservice to deploy `meta/llama-3.2-1b-instruct`.
|
||||
|
||||
Note: For improved inference speeds, we need to use NIM with `fast_outlines` guided decoding system (specified in the request body). This is the default if you deployed the platform with the NeMo Microservices Helm Chart.
|
||||
```sh
|
||||
# URL to NeMo NIM Proxy service
|
||||
export NEMO_URL="http://nemo.test"
|
||||
|
||||
curl --location "$NEMO_URL/v1/deployment/model-deployments" \
|
||||
-H 'accept: application/json' \
|
||||
-H 'Content-Type: application/json' \
|
||||
-d '{
|
||||
"name": "llama-3.2-1b-instruct",
|
||||
"namespace": "meta",
|
||||
"config": {
|
||||
"model": "meta/llama-3.2-1b-instruct",
|
||||
"nim_deployment": {
|
||||
"image_name": "nvcr.io/nim/meta/llama-3.2-1b-instruct",
|
||||
"image_tag": "1.8.3",
|
||||
"pvc_size": "25Gi",
|
||||
"gpu": 1,
|
||||
"additional_envs": {
|
||||
"NIM_GUIDED_DECODING_BACKEND": "fast_outlines"
|
||||
}
|
||||
}
|
||||
}
|
||||
}'
|
||||
```
|
||||
This NIM deployment should take approximately 10 minutes to go live. [See the docs](https://docs.nvidia.com/nemo/microservices/latest/get-started/tutorials/deploy-nims.html) for more information on how to deploy a NIM and verify it's available for inference.
|
||||
|
||||
You can also remove a deployed NIM to free up GPU resources, if needed.
|
||||
```sh
|
||||
export NEMO_URL="http://nemo.test"
|
||||
|
||||
curl -X DELETE "$NEMO_URL/v1/deployment/model-deployments/meta/llama-3.1-8b-instruct"
|
||||
```
|
||||
|
||||
## Running Llama Stack with NVIDIA
|
||||
|
||||
You can do this via Conda (build code) or Docker which has a pre-built image.
|
||||
You can do this via Conda or venv (build code), or Docker which has a pre-built image.
|
||||
|
||||
### Via Docker
|
||||
|
||||
|
|
@ -80,9 +151,23 @@ docker run \
|
|||
### Via Conda
|
||||
|
||||
```bash
|
||||
INFERENCE_MODEL=meta-llama/Llama-3.1-8b-Instruct
|
||||
llama stack build --template nvidia --image-type conda
|
||||
llama stack run ./run.yaml \
|
||||
--port 8321 \
|
||||
--env NVIDIA_API_KEY=$NVIDIA_API_KEY
|
||||
--env NVIDIA_API_KEY=$NVIDIA_API_KEY \
|
||||
--env INFERENCE_MODEL=$INFERENCE_MODEL
|
||||
```
|
||||
|
||||
### Via venv
|
||||
|
||||
If you've set up your local development environment, you can also build the image using your local virtual environment.
|
||||
|
||||
```bash
|
||||
INFERENCE_MODEL=meta-llama/Llama-3.1-8b-Instruct
|
||||
llama stack build --template nvidia --image-type venv
|
||||
llama stack run ./run.yaml \
|
||||
--port 8321 \
|
||||
--env NVIDIA_API_KEY=$NVIDIA_API_KEY \
|
||||
--env INFERENCE_MODEL=$INFERENCE_MODEL
|
||||
```
|
||||
|
|
|
|||
|
|
@ -22,7 +22,7 @@ The `llamastack/distribution-ollama` distribution consists of the following prov
|
|||
| safety | `inline::llama-guard` |
|
||||
| scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` |
|
||||
| telemetry | `inline::meta-reference` |
|
||||
| tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::code-interpreter`, `inline::rag-runtime`, `remote::model-context-protocol`, `remote::wolfram-alpha` |
|
||||
| tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::rag-runtime`, `remote::model-context-protocol`, `remote::wolfram-alpha` |
|
||||
| vector_io | `inline::faiss`, `remote::chromadb`, `remote::pgvector` |
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -22,7 +22,7 @@ The `llamastack/distribution-passthrough` distribution consists of the following
|
|||
| safety | `inline::llama-guard` |
|
||||
| scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` |
|
||||
| telemetry | `inline::meta-reference` |
|
||||
| tool_runtime | `remote::brave-search`, `remote::tavily-search`, `remote::wolfram-alpha`, `inline::code-interpreter`, `inline::rag-runtime`, `remote::model-context-protocol` |
|
||||
| tool_runtime | `remote::brave-search`, `remote::tavily-search`, `remote::wolfram-alpha`, `inline::rag-runtime`, `remote::model-context-protocol` |
|
||||
| vector_io | `inline::faiss`, `remote::chromadb`, `remote::pgvector` |
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -21,7 +21,7 @@ The `llamastack/distribution-remote-vllm` distribution consists of the following
|
|||
| safety | `inline::llama-guard` |
|
||||
| scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` |
|
||||
| telemetry | `inline::meta-reference` |
|
||||
| tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::code-interpreter`, `inline::rag-runtime`, `remote::model-context-protocol`, `remote::wolfram-alpha` |
|
||||
| tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::rag-runtime`, `remote::model-context-protocol`, `remote::wolfram-alpha` |
|
||||
| vector_io | `inline::faiss`, `remote::chromadb`, `remote::pgvector` |
|
||||
|
||||
|
||||
|
|
@ -41,10 +41,10 @@ The following environment variables can be configured:
|
|||
|
||||
## Setting up vLLM server
|
||||
|
||||
In the following sections, we'll use either AMD and NVIDIA GPUs to serve as hardware accelerators for the vLLM
|
||||
In the following sections, we'll use AMD, NVIDIA or Intel GPUs to serve as hardware accelerators for the vLLM
|
||||
server, which acts as both the LLM inference provider and the safety provider. Note that vLLM also
|
||||
[supports many other hardware accelerators](https://docs.vllm.ai/en/latest/getting_started/installation.html) and
|
||||
that we only use GPUs here for demonstration purposes.
|
||||
that we only use GPUs here for demonstration purposes. Note that if you run into issues, you can include the environment variable `--env VLLM_DEBUG_LOG_API_SERVER_RESPONSE=true` (available in vLLM v0.8.3 and above) in the `docker run` command to enable log response from API server for debugging.
|
||||
|
||||
### Setting up vLLM server on AMD GPU
|
||||
|
||||
|
|
@ -162,6 +162,55 @@ docker run \
|
|||
--port $SAFETY_PORT
|
||||
```
|
||||
|
||||
### Setting up vLLM server on Intel GPU
|
||||
|
||||
Refer to [vLLM Documentation for XPU](https://docs.vllm.ai/en/v0.8.2/getting_started/installation/gpu.html?device=xpu) to get a vLLM endpoint. In addition to vLLM side setup which guides towards installing vLLM from sources orself-building vLLM Docker container, Intel provides prebuilt vLLM container to use on systems with Intel GPUs supported by PyTorch XPU backend:
|
||||
- [intel/vllm](https://hub.docker.com/r/intel/vllm)
|
||||
|
||||
Here is a sample script to start a vLLM server locally via Docker using Intel provided container:
|
||||
|
||||
```bash
|
||||
export INFERENCE_PORT=8000
|
||||
export INFERENCE_MODEL=meta-llama/Llama-3.2-1B-Instruct
|
||||
export ZE_AFFINITY_MASK=0
|
||||
|
||||
docker run \
|
||||
--pull always \
|
||||
--device /dev/dri \
|
||||
-v /dev/dri/by-path:/dev/dri/by-path \
|
||||
-v ~/.cache/huggingface:/root/.cache/huggingface \
|
||||
--env "HUGGING_FACE_HUB_TOKEN=$HF_TOKEN" \
|
||||
--env ZE_AFFINITY_MASK=$ZE_AFFINITY_MASK \
|
||||
-p $INFERENCE_PORT:$INFERENCE_PORT \
|
||||
--ipc=host \
|
||||
intel/vllm:xpu \
|
||||
--gpu-memory-utilization 0.7 \
|
||||
--model $INFERENCE_MODEL \
|
||||
--port $INFERENCE_PORT
|
||||
```
|
||||
|
||||
If you are using Llama Stack Safety / Shield APIs, then you will need to also run another instance of a vLLM with a corresponding safety model like `meta-llama/Llama-Guard-3-1B` using a script like:
|
||||
|
||||
```bash
|
||||
export SAFETY_PORT=8081
|
||||
export SAFETY_MODEL=meta-llama/Llama-Guard-3-1B
|
||||
export ZE_AFFINITY_MASK=1
|
||||
|
||||
docker run \
|
||||
--pull always \
|
||||
--device /dev/dri \
|
||||
-v /dev/dri/by-path:/dev/dri/by-path \
|
||||
-v ~/.cache/huggingface:/root/.cache/huggingface \
|
||||
--env "HUGGING_FACE_HUB_TOKEN=$HF_TOKEN" \
|
||||
--env ZE_AFFINITY_MASK=$ZE_AFFINITY_MASK \
|
||||
-p $SAFETY_PORT:$SAFETY_PORT \
|
||||
--ipc=host \
|
||||
intel/vllm:xpu \
|
||||
--gpu-memory-utilization 0.7 \
|
||||
--model $SAFETY_MODEL \
|
||||
--port $SAFETY_PORT
|
||||
```
|
||||
|
||||
## Running Llama Stack
|
||||
|
||||
Now you are ready to run Llama Stack with vLLM as the inference provider. You can do this via Conda (build code) or Docker which has a pre-built image.
|
||||
|
|
|
|||
|
|
@ -19,7 +19,7 @@ The `llamastack/distribution-sambanova` distribution consists of the following p
|
|||
| inference | `remote::sambanova`, `inline::sentence-transformers` |
|
||||
| safety | `inline::llama-guard` |
|
||||
| telemetry | `inline::meta-reference` |
|
||||
| tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::code-interpreter`, `inline::rag-runtime`, `remote::model-context-protocol`, `remote::wolfram-alpha` |
|
||||
| tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::rag-runtime`, `remote::model-context-protocol`, `remote::wolfram-alpha` |
|
||||
| vector_io | `inline::faiss`, `remote::chromadb`, `remote::pgvector` |
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -23,7 +23,7 @@ The `llamastack/distribution-tgi` distribution consists of the following provide
|
|||
| safety | `inline::llama-guard` |
|
||||
| scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` |
|
||||
| telemetry | `inline::meta-reference` |
|
||||
| tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::code-interpreter`, `inline::rag-runtime`, `remote::model-context-protocol` |
|
||||
| tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::rag-runtime`, `remote::model-context-protocol` |
|
||||
| vector_io | `inline::faiss`, `remote::chromadb`, `remote::pgvector` |
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -22,7 +22,7 @@ The `llamastack/distribution-together` distribution consists of the following pr
|
|||
| safety | `inline::llama-guard` |
|
||||
| scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` |
|
||||
| telemetry | `inline::meta-reference` |
|
||||
| tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::code-interpreter`, `inline::rag-runtime`, `remote::model-context-protocol`, `remote::wolfram-alpha` |
|
||||
| tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::rag-runtime`, `remote::model-context-protocol`, `remote::wolfram-alpha` |
|
||||
| vector_io | `inline::faiss`, `remote::chromadb`, `remote::pgvector` |
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -50,9 +50,11 @@ Llama Stack supports two types of external providers:
|
|||
|
||||
Here's a list of known external providers that you can use with Llama Stack:
|
||||
|
||||
| Type | Name | Description | Repository |
|
||||
|------|------|-------------|------------|
|
||||
| Remote | KubeFlow Training | Train models with KubeFlow | [llama-stack-provider-kft](https://github.com/opendatahub-io/llama-stack-provider-kft) |
|
||||
| Name | Description | API | Type | Repository |
|
||||
|------|-------------|-----|------|------------|
|
||||
| KubeFlow Training | Train models with KubeFlow | Post Training | Remote | [llama-stack-provider-kft](https://github.com/opendatahub-io/llama-stack-provider-kft) |
|
||||
| KubeFlow Pipelines | Train models with KubeFlow Pipelines | Post Training | Remote | [llama-stack-provider-kfp-trainer](https://github.com/opendatahub-io/llama-stack-provider-kfp-trainer) |
|
||||
| RamaLama | Inference models with RamaLama | Inference | Remote | [ramalama-stack](https://github.com/containers/ramalama-stack) |
|
||||
|
||||
### Remote Provider Specification
|
||||
|
||||
|
|
|
|||
|
|
@ -389,5 +389,7 @@
|
|||
"pygments_lexer": "ipython3",
|
||||
"version": "3.10.15"
|
||||
}
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
|
|
|
|||
|
|
@ -256,5 +256,7 @@
|
|||
"pygments_lexer": "ipython3",
|
||||
"version": "3.10.15"
|
||||
}
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
|
|
|
|||
|
|
@ -301,5 +301,7 @@
|
|||
"pygments_lexer": "ipython3",
|
||||
"version": "3.12.2"
|
||||
}
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
|
|
|
|||
|
|
@ -200,5 +200,7 @@
|
|||
"pygments_lexer": "ipython3",
|
||||
"version": "3.12.2"
|
||||
}
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
|
|
|
|||
|
|
@ -355,5 +355,7 @@
|
|||
"pygments_lexer": "ipython3",
|
||||
"version": "3.10.15"
|
||||
}
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
|
|
|
|||
|
|
@ -398,5 +398,7 @@
|
|||
"pygments_lexer": "ipython3",
|
||||
"version": "3.10.15"
|
||||
}
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
|
|
|
|||
|
|
@ -132,5 +132,7 @@
|
|||
"pygments_lexer": "ipython3",
|
||||
"version": "3.11.10"
|
||||
}
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
|
|
|
|||
|
|
@ -188,5 +188,7 @@
|
|||
"pygments_lexer": "ipython3",
|
||||
"version": "3.10.15"
|
||||
}
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
|
|
|
|||
|
|
@ -86,11 +86,11 @@ If you're looking for more specific topics, we have a [Zero to Hero Guide](#next
|
|||
llama stack build --template ollama --image-type conda
|
||||
```
|
||||
**Expected Output:**
|
||||
```
|
||||
```bash
|
||||
...
|
||||
Build Successful! Next steps:
|
||||
1. Set the environment variables: LLAMA_STACK_PORT, OLLAMA_URL, INFERENCE_MODEL, SAFETY_MODEL
|
||||
2. `llama stack run /Users/<username>/.llama/distributions/llamastack-ollama/ollama-run.yaml
|
||||
Build Successful!
|
||||
You can find the newly-built template here: ~/.llama/distributions/ollama/ollama-run.yaml
|
||||
You can run the new Llama Stack Distro via: llama stack run ~/.llama/distributions/ollama/ollama-run.yaml --image-type conda
|
||||
```
|
||||
|
||||
3. **Set the ENV variables by exporting them to the terminal**:
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue