Mirror of https://github.com/meta-llama/llama-stack.git (synced 2025-10-07 12:47:37 +00:00)

Merge branch 'main' into fix/vector-db-mandatory-provider-id

commit 4374da02f3
243 changed files with 21774 additions and 17408 deletions
docs/_static/css/my_theme.css (vendored, 101 changes)

@@ -1,5 +1,106 @@
@import url("theme.css");

/* Horizontal Navigation Bar */
.horizontal-nav {
    background-color: #ffffff;
    border-bottom: 1px solid #e5e5e5;
    padding: 0;
    position: fixed;
    top: 0;
    left: 0;
    right: 0;
    z-index: 1050;
    height: 50px;
    box-shadow: 0 2px 4px rgba(0,0,0,0.1);
}

[data-theme="dark"] .horizontal-nav {
    background-color: #1a1a1a;
    border-bottom: 1px solid #333;
}

.horizontal-nav .nav-container {
    max-width: 1200px;
    margin: 0 auto;
    display: flex;
    align-items: center;
    justify-content: space-between;
    padding: 0 20px;
    height: 100%;
}

.horizontal-nav .nav-brand {
    font-size: 18px;
    font-weight: 600;
    color: #333;
    text-decoration: none;
}

[data-theme="dark"] .horizontal-nav .nav-brand {
    color: #fff;
}

.horizontal-nav .nav-links {
    display: flex;
    align-items: center;
    gap: 30px;
    list-style: none;
    margin: 0;
    padding: 0;
}

.horizontal-nav .nav-links a {
    color: #666;
    text-decoration: none;
    font-size: 14px;
    font-weight: 500;
    padding: 8px 12px;
    border-radius: 6px;
    transition: all 0.2s ease;
}

.horizontal-nav .nav-links a:hover,
.horizontal-nav .nav-links a.active {
    color: #333;
    background-color: #f5f5f5;
}

.horizontal-nav .nav-links a.active {
    font-weight: 600;
}

[data-theme="dark"] .horizontal-nav .nav-links a {
    color: #ccc;
}

[data-theme="dark"] .horizontal-nav .nav-links a:hover,
[data-theme="dark"] .horizontal-nav .nav-links a.active {
    color: #fff;
    background-color: #333;
}

.horizontal-nav .nav-links .github-link {
    display: flex;
    align-items: center;
    gap: 6px;
}

.horizontal-nav .nav-links .github-icon {
    width: 16px;
    height: 16px;
    fill: currentColor;
}

/* Adjust main content to account for fixed nav */
.wy-nav-side {
    top: 50px;
    height: calc(100vh - 50px);
}

.wy-nav-content-wrap {
    margin-top: 50px;
}

.wy-nav-content {
    max-width: 90%;
}
docs/_static/js/horizontal_nav.js (vendored, new file, 44 lines)

@@ -0,0 +1,44 @@
// Horizontal Navigation Bar for Llama Stack Documentation
document.addEventListener('DOMContentLoaded', function() {
    // Create the horizontal navigation HTML
    const navHTML = `
        <nav class="horizontal-nav">
            <div class="nav-container">
                <a href="/" class="nav-brand">Llama Stack</a>
                <ul class="nav-links">
                    <li><a href="/">Docs</a></li>
                    <li><a href="/references/api_reference/">API Reference</a></li>
                    <li><a href="https://github.com/meta-llama/llama-stack" target="_blank" class="github-link">
                        <svg class="github-icon" viewBox="0 0 16 16" aria-hidden="true">
                            <path d="M8 0C3.58 0 0 3.58 0 8c0 3.54 2.29 6.53 5.47 7.59.4.07.55-.17.55-.38 0-.19-.01-.82-.01-1.49-2.01.37-2.53-.49-2.69-.94-.09-.23-.48-.94-.82-1.13-.28-.15-.68-.52-.01-.53.63-.01 1.08.58 1.23.82.72 1.21 1.87.87 2.33.66.07-.52.28-.87.51-1.07-1.78-.2-3.64-.89-3.64-3.95 0-.87.31-1.59.82-2.15-.08-.2-.36-1.02.08-2.12 0 0 .67-.21 2.2.82.64-.18 1.32-.27 2-.27.68 0 1.36.09 2 .27 1.53-1.04 2.2-.82 2.2-.82.44 1.1.16 1.92.08 2.12.51.56.82 1.27.82 2.15 0 3.07-1.87 3.75-3.65 3.95.29.25.54.73.54 1.48 0 1.07-.01 1.93-.01 2.2 0 .21.15.46.55.38A8.013 8.013 0 0016 8c0-4.42-3.58-8-8-8z"/>
                        </svg>
                        GitHub
                    </a></li>
                </ul>
            </div>
        </nav>
    `;

    // Insert the navigation at the beginning of the body
    document.body.insertAdjacentHTML('afterbegin', navHTML);

    // Update navigation links based on current page
    updateActiveNav();
});

function updateActiveNav() {
    const currentPath = window.location.pathname;
    const navLinks = document.querySelectorAll('.horizontal-nav .nav-links a');

    navLinks.forEach(link => {
        // Remove any existing active classes
        link.classList.remove('active');

        // Add active class based on current path
        if (currentPath === '/' && link.getAttribute('href') === '/') {
            link.classList.add('active');
        } else if (currentPath.includes('/references/api_reference/') && link.getAttribute('href').includes('api_reference')) {
            link.classList.add('active');
        }
    });
}
|
docs/_static/llama-stack-spec.html (vendored, 457 changes)

@@ -633,6 +633,80 @@
|
|||
}
|
||||
}
|
||||
},
|
||||
"/v1/prompts": {
|
||||
"get": {
|
||||
"responses": {
|
||||
"200": {
|
||||
"description": "A ListPromptsResponse containing all prompts.",
|
||||
"content": {
|
||||
"application/json": {
|
||||
"schema": {
|
||||
"$ref": "#/components/schemas/ListPromptsResponse"
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"400": {
|
||||
"$ref": "#/components/responses/BadRequest400"
|
||||
},
|
||||
"429": {
|
||||
"$ref": "#/components/responses/TooManyRequests429"
|
||||
},
|
||||
"500": {
|
||||
"$ref": "#/components/responses/InternalServerError500"
|
||||
},
|
||||
"default": {
|
||||
"$ref": "#/components/responses/DefaultError"
|
||||
}
|
||||
},
|
||||
"tags": [
|
||||
"Prompts"
|
||||
],
|
||||
"description": "List all prompts.",
|
||||
"parameters": []
|
||||
},
|
||||
"post": {
|
||||
"responses": {
|
||||
"200": {
|
||||
"description": "The created Prompt resource.",
|
||||
"content": {
|
||||
"application/json": {
|
||||
"schema": {
|
||||
"$ref": "#/components/schemas/Prompt"
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"400": {
|
||||
"$ref": "#/components/responses/BadRequest400"
|
||||
},
|
||||
"429": {
|
||||
"$ref": "#/components/responses/TooManyRequests429"
|
||||
},
|
||||
"500": {
|
||||
"$ref": "#/components/responses/InternalServerError500"
|
||||
},
|
||||
"default": {
|
||||
"$ref": "#/components/responses/DefaultError"
|
||||
}
|
||||
},
|
||||
"tags": [
|
||||
"Prompts"
|
||||
],
|
||||
"description": "Create a new prompt.",
|
||||
"parameters": [],
|
||||
"requestBody": {
|
||||
"content": {
|
||||
"application/json": {
|
||||
"schema": {
|
||||
"$ref": "#/components/schemas/CreatePromptRequest"
|
||||
}
|
||||
}
|
||||
},
|
||||
"required": true
|
||||
}
|
||||
}
|
||||
},
|
||||
"/v1/agents/{agent_id}": {
|
||||
"get": {
|
||||
"responses": {
|
||||
|
@ -901,6 +975,143 @@
|
|||
]
|
||||
}
|
||||
},
|
||||
"/v1/prompts/{prompt_id}": {
|
||||
"get": {
|
||||
"responses": {
|
||||
"200": {
|
||||
"description": "A Prompt resource.",
|
||||
"content": {
|
||||
"application/json": {
|
||||
"schema": {
|
||||
"$ref": "#/components/schemas/Prompt"
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"400": {
|
||||
"$ref": "#/components/responses/BadRequest400"
|
||||
},
|
||||
"429": {
|
||||
"$ref": "#/components/responses/TooManyRequests429"
|
||||
},
|
||||
"500": {
|
||||
"$ref": "#/components/responses/InternalServerError500"
|
||||
},
|
||||
"default": {
|
||||
"$ref": "#/components/responses/DefaultError"
|
||||
}
|
||||
},
|
||||
"tags": [
|
||||
"Prompts"
|
||||
],
|
||||
"description": "Get a prompt by its identifier and optional version.",
|
||||
"parameters": [
|
||||
{
|
||||
"name": "prompt_id",
|
||||
"in": "path",
|
||||
"description": "The identifier of the prompt to get.",
|
||||
"required": true,
|
||||
"schema": {
|
||||
"type": "string"
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "version",
|
||||
"in": "query",
|
||||
"description": "The version of the prompt to get (defaults to latest).",
|
||||
"required": false,
|
||||
"schema": {
|
||||
"type": "integer"
|
||||
}
|
||||
}
|
||||
]
|
||||
},
|
||||
"post": {
|
||||
"responses": {
|
||||
"200": {
|
||||
"description": "The updated Prompt resource with incremented version.",
|
||||
"content": {
|
||||
"application/json": {
|
||||
"schema": {
|
||||
"$ref": "#/components/schemas/Prompt"
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"400": {
|
||||
"$ref": "#/components/responses/BadRequest400"
|
||||
},
|
||||
"429": {
|
||||
"$ref": "#/components/responses/TooManyRequests429"
|
||||
},
|
||||
"500": {
|
||||
"$ref": "#/components/responses/InternalServerError500"
|
||||
},
|
||||
"default": {
|
||||
"$ref": "#/components/responses/DefaultError"
|
||||
}
|
||||
},
|
||||
"tags": [
|
||||
"Prompts"
|
||||
],
|
||||
"description": "Update an existing prompt (increments version).",
|
||||
"parameters": [
|
||||
{
|
||||
"name": "prompt_id",
|
||||
"in": "path",
|
||||
"description": "The identifier of the prompt to update.",
|
||||
"required": true,
|
||||
"schema": {
|
||||
"type": "string"
|
||||
}
|
||||
}
|
||||
],
|
||||
"requestBody": {
|
||||
"content": {
|
||||
"application/json": {
|
||||
"schema": {
|
||||
"$ref": "#/components/schemas/UpdatePromptRequest"
|
||||
}
|
||||
}
|
||||
},
|
||||
"required": true
|
||||
}
|
||||
},
|
||||
"delete": {
|
||||
"responses": {
|
||||
"200": {
|
||||
"description": "OK"
|
||||
},
|
||||
"400": {
|
||||
"$ref": "#/components/responses/BadRequest400"
|
||||
},
|
||||
"429": {
|
||||
"$ref": "#/components/responses/TooManyRequests429"
|
||||
},
|
||||
"500": {
|
||||
"$ref": "#/components/responses/InternalServerError500"
|
||||
},
|
||||
"default": {
|
||||
"$ref": "#/components/responses/DefaultError"
|
||||
}
|
||||
},
|
||||
"tags": [
|
||||
"Prompts"
|
||||
],
|
||||
"description": "Delete a prompt.",
|
||||
"parameters": [
|
||||
{
|
||||
"name": "prompt_id",
|
||||
"in": "path",
|
||||
"description": "The identifier of the prompt to delete.",
|
||||
"required": true,
|
||||
"schema": {
|
||||
"type": "string"
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
},
|
||||
"/v1/inference/embeddings": {
|
||||
"post": {
|
||||
"responses": {
|
||||
|
@ -2836,6 +3047,49 @@
|
|||
]
|
||||
}
|
||||
},
|
||||
"/v1/prompts/{prompt_id}/versions": {
|
||||
"get": {
|
||||
"responses": {
|
||||
"200": {
|
||||
"description": "A ListPromptsResponse containing all versions of the prompt.",
|
||||
"content": {
|
||||
"application/json": {
|
||||
"schema": {
|
||||
"$ref": "#/components/schemas/ListPromptsResponse"
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"400": {
|
||||
"$ref": "#/components/responses/BadRequest400"
|
||||
},
|
||||
"429": {
|
||||
"$ref": "#/components/responses/TooManyRequests429"
|
||||
},
|
||||
"500": {
|
||||
"$ref": "#/components/responses/InternalServerError500"
|
||||
},
|
||||
"default": {
|
||||
"$ref": "#/components/responses/DefaultError"
|
||||
}
|
||||
},
|
||||
"tags": [
|
||||
"Prompts"
|
||||
],
|
||||
"description": "List all versions of a specific prompt.",
|
||||
"parameters": [
|
||||
{
|
||||
"name": "prompt_id",
|
||||
"in": "path",
|
||||
"description": "The identifier of the prompt to list versions for.",
|
||||
"required": true,
|
||||
"schema": {
|
||||
"type": "string"
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
},
|
||||
"/v1/providers": {
|
||||
"get": {
|
||||
"responses": {
|
||||
|
@ -5007,6 +5261,59 @@
|
|||
}
|
||||
}
|
||||
},
|
||||
"/v1/prompts/{prompt_id}/set-default-version": {
|
||||
"post": {
|
||||
"responses": {
|
||||
"200": {
|
||||
"description": "The prompt with the specified version now set as default.",
|
||||
"content": {
|
||||
"application/json": {
|
||||
"schema": {
|
||||
"$ref": "#/components/schemas/Prompt"
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"400": {
|
||||
"$ref": "#/components/responses/BadRequest400"
|
||||
},
|
||||
"429": {
|
||||
"$ref": "#/components/responses/TooManyRequests429"
|
||||
},
|
||||
"500": {
|
||||
"$ref": "#/components/responses/InternalServerError500"
|
||||
},
|
||||
"default": {
|
||||
"$ref": "#/components/responses/DefaultError"
|
||||
}
|
||||
},
|
||||
"tags": [
|
||||
"Prompts"
|
||||
],
|
||||
"description": "Set which version of a prompt should be the default in get_prompt (latest).",
|
||||
"parameters": [
|
||||
{
|
||||
"name": "prompt_id",
|
||||
"in": "path",
|
||||
"description": "The identifier of the prompt.",
|
||||
"required": true,
|
||||
"schema": {
|
||||
"type": "string"
|
||||
}
|
||||
}
|
||||
],
|
||||
"requestBody": {
|
||||
"content": {
|
||||
"application/json": {
|
||||
"schema": {
|
||||
"$ref": "#/components/schemas/SetDefaultVersionRequest"
|
||||
}
|
||||
}
|
||||
},
|
||||
"required": true
|
||||
}
|
||||
}
|
||||
},
|
||||
"/v1/post-training/supervised-fine-tune": {
|
||||
"post": {
|
||||
"responses": {
|
||||
|
@ -9670,6 +9977,65 @@
|
|||
],
|
||||
"title": "OpenAIResponseObjectStreamResponseWebSearchCallSearching"
|
||||
},
|
||||
"CreatePromptRequest": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"prompt": {
|
||||
"type": "string",
|
||||
"description": "The prompt text content with variable placeholders."
|
||||
},
|
||||
"variables": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "string"
|
||||
},
|
||||
"description": "List of variable names that can be used in the prompt template."
|
||||
}
|
||||
},
|
||||
"additionalProperties": false,
|
||||
"required": [
|
||||
"prompt"
|
||||
],
|
||||
"title": "CreatePromptRequest"
|
||||
},
|
||||
"Prompt": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"prompt": {
|
||||
"type": "string",
|
||||
"description": "The system prompt text with variable placeholders. Variables are only supported when using the Responses API."
|
||||
},
|
||||
"version": {
|
||||
"type": "integer",
|
||||
"description": "Version (integer starting at 1, incremented on save)"
|
||||
},
|
||||
"prompt_id": {
|
||||
"type": "string",
|
||||
"description": "Unique identifier formatted as 'pmpt_<48-digit-hash>'"
|
||||
},
|
||||
"variables": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "string"
|
||||
},
|
||||
"description": "List of prompt variable names that can be used in the prompt template"
|
||||
},
|
||||
"is_default": {
|
||||
"type": "boolean",
|
||||
"default": false,
|
||||
"description": "Boolean indicating whether this version is the default version for this prompt"
|
||||
}
|
||||
},
|
||||
"additionalProperties": false,
|
||||
"required": [
|
||||
"version",
|
||||
"prompt_id",
|
||||
"variables",
|
||||
"is_default"
|
||||
],
|
||||
"title": "Prompt",
|
||||
"description": "A prompt resource representing a stored OpenAI Compatible prompt template in Llama Stack."
|
||||
},
|
||||
"OpenAIDeleteResponseObject": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
|
@ -10296,7 +10662,8 @@
|
|||
"scoring_function",
|
||||
"benchmark",
|
||||
"tool",
|
||||
"tool_group"
|
||||
"tool_group",
|
||||
"prompt"
|
||||
],
|
||||
"const": "benchmark",
|
||||
"default": "benchmark",
|
||||
|
@ -10923,7 +11290,8 @@
|
|||
"scoring_function",
|
||||
"benchmark",
|
||||
"tool",
|
||||
"tool_group"
|
||||
"tool_group",
|
||||
"prompt"
|
||||
],
|
||||
"const": "dataset",
|
||||
"default": "dataset",
|
||||
|
@ -11073,7 +11441,8 @@
|
|||
"scoring_function",
|
||||
"benchmark",
|
||||
"tool",
|
||||
"tool_group"
|
||||
"tool_group",
|
||||
"prompt"
|
||||
],
|
||||
"const": "model",
|
||||
"default": "model",
|
||||
|
@ -11338,7 +11707,8 @@
|
|||
"scoring_function",
|
||||
"benchmark",
|
||||
"tool",
|
||||
"tool_group"
|
||||
"tool_group",
|
||||
"prompt"
|
||||
],
|
||||
"const": "scoring_function",
|
||||
"default": "scoring_function",
|
||||
|
@ -11446,7 +11816,8 @@
|
|||
"scoring_function",
|
||||
"benchmark",
|
||||
"tool",
|
||||
"tool_group"
|
||||
"tool_group",
|
||||
"prompt"
|
||||
],
|
||||
"const": "shield",
|
||||
"default": "shield",
|
||||
|
@ -11691,7 +12062,8 @@
|
|||
"scoring_function",
|
||||
"benchmark",
|
||||
"tool",
|
||||
"tool_group"
|
||||
"tool_group",
|
||||
"prompt"
|
||||
],
|
||||
"const": "tool",
|
||||
"default": "tool",
|
||||
|
@ -11773,7 +12145,8 @@
|
|||
"scoring_function",
|
||||
"benchmark",
|
||||
"tool",
|
||||
"tool_group"
|
||||
"tool_group",
|
||||
"prompt"
|
||||
],
|
||||
"const": "tool_group",
|
||||
"default": "tool_group",
|
||||
|
@ -12067,7 +12440,8 @@
|
|||
"scoring_function",
|
||||
"benchmark",
|
||||
"tool",
|
||||
"tool_group"
|
||||
"tool_group",
|
||||
"prompt"
|
||||
],
|
||||
"const": "vector_db",
|
||||
"default": "vector_db",
|
||||
|
@ -12882,6 +13256,23 @@
|
|||
"title": "OpenAIResponseObjectWithInput",
|
||||
"description": "OpenAI response object extended with input context information."
|
||||
},
|
||||
"ListPromptsResponse": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"data": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"$ref": "#/components/schemas/Prompt"
|
||||
}
|
||||
}
|
||||
},
|
||||
"additionalProperties": false,
|
||||
"required": [
|
||||
"data"
|
||||
],
|
||||
"title": "ListPromptsResponse",
|
||||
"description": "Response model to list prompts."
|
||||
},
|
||||
"ListProvidersResponse": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
|
@ -17129,6 +17520,20 @@
|
|||
"title": "ScoreBatchResponse",
|
||||
"description": "Response from batch scoring operations on datasets."
|
||||
},
|
||||
"SetDefaultVersionRequest": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"version": {
|
||||
"type": "integer",
|
||||
"description": "The version to set as default."
|
||||
}
|
||||
},
|
||||
"additionalProperties": false,
|
||||
"required": [
|
||||
"version"
|
||||
],
|
||||
"title": "SetDefaultVersionRequest"
|
||||
},
|
||||
"AlgorithmConfig": {
|
||||
"oneOf": [
|
||||
{
|
||||
|
@ -17413,6 +17818,37 @@
|
|||
"title": "SyntheticDataGenerationResponse",
|
||||
"description": "Response from the synthetic data generation. Batch of (prompt, response, score) tuples that pass the threshold."
|
||||
},
|
||||
"UpdatePromptRequest": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"prompt": {
|
||||
"type": "string",
|
||||
"description": "The updated prompt text content."
|
||||
},
|
||||
"version": {
|
||||
"type": "integer",
|
||||
"description": "The current version of the prompt being updated."
|
||||
},
|
||||
"variables": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "string"
|
||||
},
|
||||
"description": "Updated list of variable names that can be used in the prompt template."
|
||||
},
|
||||
"set_as_default": {
|
||||
"type": "boolean",
|
||||
"description": "Set the new version as the default (default=True)."
|
||||
}
|
||||
},
|
||||
"additionalProperties": false,
|
||||
"required": [
|
||||
"prompt",
|
||||
"version",
|
||||
"set_as_default"
|
||||
],
|
||||
"title": "UpdatePromptRequest"
|
||||
},
|
||||
"VersionInfo": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
|
@ -17538,6 +17974,10 @@
|
|||
{
|
||||
"name": "PostTraining (Coming Soon)"
|
||||
},
|
||||
{
|
||||
"name": "Prompts",
|
||||
"x-displayName": "Protocol for prompt management operations."
|
||||
},
|
||||
{
|
||||
"name": "Providers",
|
||||
"x-displayName": "Providers API for inspecting, listing, and modifying providers and their configurations."
|
||||
|
@ -17588,6 +18028,7 @@
|
|||
"Inspect",
|
||||
"Models",
|
||||
"PostTraining (Coming Soon)",
|
||||
"Prompts",
|
||||
"Providers",
|
||||
"Safety",
|
||||
"Scoring",
|
||||
|
|
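The new `/v1/prompts` paths above add create/read operations for stored prompt templates. As a minimal sketch of how a client might exercise them, assuming a local stack server on `http://127.0.0.1:8321` (the port used elsewhere in these docs); the placeholder syntax inside the prompt text is illustrative, not mandated by the schema:

```python
import requests

BASE_URL = "http://127.0.0.1:8321"  # assumed local server address

# POST /v1/prompts with a CreatePromptRequest body ("prompt" is required, "variables" optional)
created = requests.post(
    f"{BASE_URL}/v1/prompts",
    json={
        "prompt": "Summarize {{ document }} in {{ num_sentences }} sentences.",
        "variables": ["document", "num_sentences"],
    },
).json()  # -> Prompt: prompt_id, version (starts at 1), variables, is_default

# GET /v1/prompts/{prompt_id}, optionally pinned to a version via the query parameter
prompt = requests.get(
    f"{BASE_URL}/v1/prompts/{created['prompt_id']}",
    params={"version": created["version"]},
).json()
```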
docs/_static/llama-stack-spec.yaml (vendored, 332 changes)

@@ -427,6 +427,58 @@ paths:
|
|||
schema:
|
||||
$ref: '#/components/schemas/CreateOpenaiResponseRequest'
|
||||
required: true
|
||||
/v1/prompts:
|
||||
get:
|
||||
responses:
|
||||
'200':
|
||||
description: >-
|
||||
A ListPromptsResponse containing all prompts.
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: '#/components/schemas/ListPromptsResponse'
|
||||
'400':
|
||||
$ref: '#/components/responses/BadRequest400'
|
||||
'429':
|
||||
$ref: >-
|
||||
#/components/responses/TooManyRequests429
|
||||
'500':
|
||||
$ref: >-
|
||||
#/components/responses/InternalServerError500
|
||||
default:
|
||||
$ref: '#/components/responses/DefaultError'
|
||||
tags:
|
||||
- Prompts
|
||||
description: List all prompts.
|
||||
parameters: []
|
||||
post:
|
||||
responses:
|
||||
'200':
|
||||
description: The created Prompt resource.
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: '#/components/schemas/Prompt'
|
||||
'400':
|
||||
$ref: '#/components/responses/BadRequest400'
|
||||
'429':
|
||||
$ref: >-
|
||||
#/components/responses/TooManyRequests429
|
||||
'500':
|
||||
$ref: >-
|
||||
#/components/responses/InternalServerError500
|
||||
default:
|
||||
$ref: '#/components/responses/DefaultError'
|
||||
tags:
|
||||
- Prompts
|
||||
description: Create a new prompt.
|
||||
parameters: []
|
||||
requestBody:
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: '#/components/schemas/CreatePromptRequest'
|
||||
required: true
|
||||
/v1/agents/{agent_id}:
|
||||
get:
|
||||
responses:
|
||||
|
@ -616,6 +668,103 @@ paths:
|
|||
required: true
|
||||
schema:
|
||||
type: string
|
||||
/v1/prompts/{prompt_id}:
|
||||
get:
|
||||
responses:
|
||||
'200':
|
||||
description: A Prompt resource.
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: '#/components/schemas/Prompt'
|
||||
'400':
|
||||
$ref: '#/components/responses/BadRequest400'
|
||||
'429':
|
||||
$ref: >-
|
||||
#/components/responses/TooManyRequests429
|
||||
'500':
|
||||
$ref: >-
|
||||
#/components/responses/InternalServerError500
|
||||
default:
|
||||
$ref: '#/components/responses/DefaultError'
|
||||
tags:
|
||||
- Prompts
|
||||
description: >-
|
||||
Get a prompt by its identifier and optional version.
|
||||
parameters:
|
||||
- name: prompt_id
|
||||
in: path
|
||||
description: The identifier of the prompt to get.
|
||||
required: true
|
||||
schema:
|
||||
type: string
|
||||
- name: version
|
||||
in: query
|
||||
description: >-
|
||||
The version of the prompt to get (defaults to latest).
|
||||
required: false
|
||||
schema:
|
||||
type: integer
|
||||
post:
|
||||
responses:
|
||||
'200':
|
||||
description: >-
|
||||
The updated Prompt resource with incremented version.
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: '#/components/schemas/Prompt'
|
||||
'400':
|
||||
$ref: '#/components/responses/BadRequest400'
|
||||
'429':
|
||||
$ref: >-
|
||||
#/components/responses/TooManyRequests429
|
||||
'500':
|
||||
$ref: >-
|
||||
#/components/responses/InternalServerError500
|
||||
default:
|
||||
$ref: '#/components/responses/DefaultError'
|
||||
tags:
|
||||
- Prompts
|
||||
description: >-
|
||||
Update an existing prompt (increments version).
|
||||
parameters:
|
||||
- name: prompt_id
|
||||
in: path
|
||||
description: The identifier of the prompt to update.
|
||||
required: true
|
||||
schema:
|
||||
type: string
|
||||
requestBody:
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: '#/components/schemas/UpdatePromptRequest'
|
||||
required: true
|
||||
delete:
|
||||
responses:
|
||||
'200':
|
||||
description: OK
|
||||
'400':
|
||||
$ref: '#/components/responses/BadRequest400'
|
||||
'429':
|
||||
$ref: >-
|
||||
#/components/responses/TooManyRequests429
|
||||
'500':
|
||||
$ref: >-
|
||||
#/components/responses/InternalServerError500
|
||||
default:
|
||||
$ref: '#/components/responses/DefaultError'
|
||||
tags:
|
||||
- Prompts
|
||||
description: Delete a prompt.
|
||||
parameters:
|
||||
- name: prompt_id
|
||||
in: path
|
||||
description: The identifier of the prompt to delete.
|
||||
required: true
|
||||
schema:
|
||||
type: string
|
||||
/v1/inference/embeddings:
|
||||
post:
|
||||
responses:
|
||||
|
@ -1983,6 +2132,37 @@ paths:
|
|||
required: false
|
||||
schema:
|
||||
$ref: '#/components/schemas/Order'
|
||||
/v1/prompts/{prompt_id}/versions:
|
||||
get:
|
||||
responses:
|
||||
'200':
|
||||
description: >-
|
||||
A ListPromptsResponse containing all versions of the prompt.
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: '#/components/schemas/ListPromptsResponse'
|
||||
'400':
|
||||
$ref: '#/components/responses/BadRequest400'
|
||||
'429':
|
||||
$ref: >-
|
||||
#/components/responses/TooManyRequests429
|
||||
'500':
|
||||
$ref: >-
|
||||
#/components/responses/InternalServerError500
|
||||
default:
|
||||
$ref: '#/components/responses/DefaultError'
|
||||
tags:
|
||||
- Prompts
|
||||
description: List all versions of a specific prompt.
|
||||
parameters:
|
||||
- name: prompt_id
|
||||
in: path
|
||||
description: >-
|
||||
The identifier of the prompt to list versions for.
|
||||
required: true
|
||||
schema:
|
||||
type: string
|
||||
/v1/providers:
|
||||
get:
|
||||
responses:
|
||||
|
@ -3546,6 +3726,43 @@ paths:
|
|||
schema:
|
||||
$ref: '#/components/schemas/ScoreBatchRequest'
|
||||
required: true
|
||||
/v1/prompts/{prompt_id}/set-default-version:
|
||||
post:
|
||||
responses:
|
||||
'200':
|
||||
description: >-
|
||||
The prompt with the specified version now set as default.
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: '#/components/schemas/Prompt'
|
||||
'400':
|
||||
$ref: '#/components/responses/BadRequest400'
|
||||
'429':
|
||||
$ref: >-
|
||||
#/components/responses/TooManyRequests429
|
||||
'500':
|
||||
$ref: >-
|
||||
#/components/responses/InternalServerError500
|
||||
default:
|
||||
$ref: '#/components/responses/DefaultError'
|
||||
tags:
|
||||
- Prompts
|
||||
description: >-
|
||||
Set which version of a prompt should be the default in get_prompt (latest).
|
||||
parameters:
|
||||
- name: prompt_id
|
||||
in: path
|
||||
description: The identifier of the prompt.
|
||||
required: true
|
||||
schema:
|
||||
type: string
|
||||
requestBody:
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: '#/components/schemas/SetDefaultVersionRequest'
|
||||
required: true
|
||||
/v1/post-training/supervised-fine-tune:
|
||||
post:
|
||||
responses:
|
||||
|
@ -7148,6 +7365,61 @@ components:
|
|||
- type
|
||||
title: >-
|
||||
OpenAIResponseObjectStreamResponseWebSearchCallSearching
|
||||
CreatePromptRequest:
|
||||
type: object
|
||||
properties:
|
||||
prompt:
|
||||
type: string
|
||||
description: >-
|
||||
The prompt text content with variable placeholders.
|
||||
variables:
|
||||
type: array
|
||||
items:
|
||||
type: string
|
||||
description: >-
|
||||
List of variable names that can be used in the prompt template.
|
||||
additionalProperties: false
|
||||
required:
|
||||
- prompt
|
||||
title: CreatePromptRequest
|
||||
Prompt:
|
||||
type: object
|
||||
properties:
|
||||
prompt:
|
||||
type: string
|
||||
description: >-
|
||||
The system prompt text with variable placeholders. Variables are only
|
||||
supported when using the Responses API.
|
||||
version:
|
||||
type: integer
|
||||
description: >-
|
||||
Version (integer starting at 1, incremented on save)
|
||||
prompt_id:
|
||||
type: string
|
||||
description: >-
|
||||
Unique identifier formatted as 'pmpt_<48-digit-hash>'
|
||||
variables:
|
||||
type: array
|
||||
items:
|
||||
type: string
|
||||
description: >-
|
||||
List of prompt variable names that can be used in the prompt template
|
||||
is_default:
|
||||
type: boolean
|
||||
default: false
|
||||
description: >-
|
||||
Boolean indicating whether this version is the default version for this
|
||||
prompt
|
||||
additionalProperties: false
|
||||
required:
|
||||
- version
|
||||
- prompt_id
|
||||
- variables
|
||||
- is_default
|
||||
title: Prompt
|
||||
description: >-
|
||||
A prompt resource representing a stored OpenAI Compatible prompt template
|
||||
in Llama Stack.
|
||||
OpenAIDeleteResponseObject:
|
||||
type: object
|
||||
properties:
|
||||
|
@ -7621,6 +7893,7 @@ components:
|
|||
- benchmark
|
||||
- tool
|
||||
- tool_group
|
||||
- prompt
|
||||
const: benchmark
|
||||
default: benchmark
|
||||
description: The resource type, always benchmark
|
||||
|
@ -8107,6 +8380,7 @@ components:
|
|||
- benchmark
|
||||
- tool
|
||||
- tool_group
|
||||
- prompt
|
||||
const: dataset
|
||||
default: dataset
|
||||
description: >-
|
||||
|
@ -8219,6 +8493,7 @@ components:
|
|||
- benchmark
|
||||
- tool
|
||||
- tool_group
|
||||
- prompt
|
||||
const: model
|
||||
default: model
|
||||
description: >-
|
||||
|
@ -8410,6 +8685,7 @@ components:
|
|||
- benchmark
|
||||
- tool
|
||||
- tool_group
|
||||
- prompt
|
||||
const: scoring_function
|
||||
default: scoring_function
|
||||
description: >-
|
||||
|
@ -8486,6 +8762,7 @@ components:
|
|||
- benchmark
|
||||
- tool
|
||||
- tool_group
|
||||
- prompt
|
||||
const: shield
|
||||
default: shield
|
||||
description: The resource type, always shield
|
||||
|
@ -8665,6 +8942,7 @@ components:
|
|||
- benchmark
|
||||
- tool
|
||||
- tool_group
|
||||
- prompt
|
||||
const: tool
|
||||
default: tool
|
||||
description: Type of resource, always 'tool'
|
||||
|
@ -8723,6 +9001,7 @@ components:
|
|||
- benchmark
|
||||
- tool
|
||||
- tool_group
|
||||
- prompt
|
||||
const: tool_group
|
||||
default: tool_group
|
||||
description: Type of resource, always 'tool_group'
|
||||
|
@ -8951,6 +9230,7 @@ components:
|
|||
- benchmark
|
||||
- tool
|
||||
- tool_group
|
||||
- prompt
|
||||
const: vector_db
|
||||
default: vector_db
|
||||
description: >-
|
||||
|
@ -9577,6 +9857,18 @@ components:
|
|||
title: OpenAIResponseObjectWithInput
|
||||
description: >-
|
||||
OpenAI response object extended with input context information.
|
||||
ListPromptsResponse:
|
||||
type: object
|
||||
properties:
|
||||
data:
|
||||
type: array
|
||||
items:
|
||||
$ref: '#/components/schemas/Prompt'
|
||||
additionalProperties: false
|
||||
required:
|
||||
- data
|
||||
title: ListPromptsResponse
|
||||
description: Response model to list prompts.
|
||||
ListProvidersResponse:
|
||||
type: object
|
||||
properties:
|
||||
|
@ -12723,6 +13015,16 @@ components:
|
|||
title: ScoreBatchResponse
|
||||
description: >-
|
||||
Response from batch scoring operations on datasets.
|
||||
SetDefaultVersionRequest:
|
||||
type: object
|
||||
properties:
|
||||
version:
|
||||
type: integer
|
||||
description: The version to set as default.
|
||||
additionalProperties: false
|
||||
required:
|
||||
- version
|
||||
title: SetDefaultVersionRequest
|
||||
AlgorithmConfig:
|
||||
oneOf:
|
||||
- $ref: '#/components/schemas/LoraFinetuningConfig'
|
||||
|
@ -12919,6 +13221,32 @@ components:
|
|||
description: >-
|
||||
Response from the synthetic data generation. Batch of (prompt, response, score)
|
||||
tuples that pass the threshold.
|
||||
UpdatePromptRequest:
|
||||
type: object
|
||||
properties:
|
||||
prompt:
|
||||
type: string
|
||||
description: The updated prompt text content.
|
||||
version:
|
||||
type: integer
|
||||
description: >-
|
||||
The current version of the prompt being updated.
|
||||
variables:
|
||||
type: array
|
||||
items:
|
||||
type: string
|
||||
description: >-
|
||||
Updated list of variable names that can be used in the prompt template.
|
||||
set_as_default:
|
||||
type: boolean
|
||||
description: >-
|
||||
Set the new version as the default (default=True).
|
||||
additionalProperties: false
|
||||
required:
|
||||
- prompt
|
||||
- version
|
||||
- set_as_default
|
||||
title: UpdatePromptRequest
|
||||
VersionInfo:
|
||||
type: object
|
||||
properties:
|
||||
|
@ -13030,6 +13358,9 @@ tags:
|
|||
- name: Inspect
|
||||
- name: Models
|
||||
- name: PostTraining (Coming Soon)
|
||||
- name: Prompts
|
||||
x-displayName: >-
|
||||
Protocol for prompt management operations.
|
||||
- name: Providers
|
||||
x-displayName: >-
|
||||
Providers API for inspecting, listing, and modifying providers and their configurations.
|
||||
|
@ -13057,6 +13388,7 @@ x-tagGroups:
|
|||
- Inspect
|
||||
- Models
|
||||
- PostTraining (Coming Soon)
|
||||
- Prompts
|
||||
- Providers
|
||||
- Safety
|
||||
- Scoring
|
||||
|
|
|
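The YAML spec mirrors the JSON document above. For completeness, a sketch of the update and versioning flow it describes (same assumed local server; the `prompt_id` value is a placeholder, and `UpdatePromptRequest` requires `prompt`, `version`, and `set_as_default`):

```python
import requests

BASE_URL = "http://127.0.0.1:8321"   # assumed local server address
prompt_id = "pmpt_..."               # placeholder; use an id returned by POST /v1/prompts

# POST /v1/prompts/{prompt_id} with an UpdatePromptRequest; the server increments the version
updated = requests.post(
    f"{BASE_URL}/v1/prompts/{prompt_id}",
    json={"prompt": "Summarize {{ document }} briefly.", "version": 1, "set_as_default": False},
).json()

# POST /v1/prompts/{prompt_id}/set-default-version picks which version get_prompt returns
requests.post(
    f"{BASE_URL}/v1/prompts/{prompt_id}/set-default-version",
    json={"version": updated["version"]},
)

# GET /v1/prompts/{prompt_id}/versions lists every stored version (ListPromptsResponse.data)
versions = requests.get(f"{BASE_URL}/v1/prompts/{prompt_id}/versions").json()["data"]
```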
@@ -131,6 +131,7 @@ html_static_path = ["../_static"]

def setup(app):
    app.add_css_file("css/my_theme.css")
    app.add_js_file("js/detect_theme.js")
    app.add_js_file("js/horizontal_nav.js")
    app.add_js_file("js/keyboard_shortcuts.js")


def dockerhub_role(name, rawtext, text, lineno, inliner, options={}, content=[]):
@@ -35,5 +35,5 @@ testing/record-replay

### Benchmarking

```{include} ../../../docs/source/distributions/k8s-benchmark/README.md
```{include} ../../../benchmarking/k8s-benchmark/README.md
```
@@ -40,18 +40,15 @@ The system patches OpenAI and Ollama client methods to intercept calls before th

### Storage Architecture

Recordings use a two-tier storage system optimized for both speed and debuggability:
Recordings are stored as JSON files in the recording directory. They are looked up by their request hash.

```
recordings/
├── index.sqlite          # Fast lookup by request hash
└── responses/
    ├── abc123def456.json # Individual response files
    └── def789ghi012.json
```

**SQLite index** enables O(log n) hash lookups and metadata queries without loading response bodies.

**JSON files** store complete request/response pairs in human-readable format for debugging.
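A minimal sketch of what such a hash-keyed lookup could look like (the directory layout follows the tree above; the hashing scheme and helper names are illustrative assumptions, not the stack's actual implementation):

```python
import hashlib
import json
from pathlib import Path

def request_hash(method: str, url: str, body: dict) -> str:
    """Derive a deterministic key for a request (illustrative hashing scheme)."""
    canonical = json.dumps({"method": method, "url": url, "body": body}, sort_keys=True)
    return hashlib.sha256(canonical.encode("utf-8")).hexdigest()[:12]

def load_recording(recordings_dir: Path, method: str, url: str, body: dict) -> dict | None:
    """Look up a stored response JSON by its request hash, if one exists."""
    path = recordings_dir / "responses" / f"{request_hash(method, url, body)}.json"
    if path.exists():
        return json.loads(path.read_text())
    return None  # no recording: the caller falls back to a live call or raises
```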
## Recording Modes

@@ -166,8 +163,8 @@ This preserves type safety - when replayed, you get the same Pydantic objects wi

Control recording behavior globally:

```bash
export LLAMA_STACK_TEST_INFERENCE_MODE=replay
export LLAMA_STACK_TEST_RECORDING_DIR=/path/to/recordings
export LLAMA_STACK_TEST_INFERENCE_MODE=replay               # this is the default
export LLAMA_STACK_TEST_RECORDING_DIR=/path/to/recordings   # default is tests/integration/recordings
pytest tests/integration/
```
@@ -354,6 +354,47 @@ You can easily validate a request by running:
curl -s -L -H "Authorization: Bearer $(cat llama-stack-auth-token)" http://127.0.0.1:8321/v1/providers
```

#### Kubernetes Authentication Provider

The server can be configured to use the Kubernetes SelfSubjectReview API to validate tokens directly against the Kubernetes API server:

```yaml
server:
  auth:
    provider_config:
      type: "kubernetes"
      api_server_url: "https://kubernetes.default.svc"
      claims_mapping:
        username: "roles"
        groups: "roles"
        uid: "uid_attr"
      verify_tls: true
      tls_cafile: "/path/to/ca.crt"
```

Configuration options:
- `api_server_url`: The Kubernetes API server URL (e.g., https://kubernetes.default.svc:6443)
- `verify_tls`: Whether to verify TLS certificates (default: true)
- `tls_cafile`: Path to CA certificate file for TLS verification
- `claims_mapping`: Mapping of Kubernetes user claims to access attributes

The provider validates tokens by sending a SelfSubjectReview request to the Kubernetes API server at `/apis/authentication.k8s.io/v1/selfsubjectreviews`. The provider extracts user information from the response:
- Username from the `userInfo.username` field
- Groups from the `userInfo.groups` field
- UID from the `userInfo.uid` field
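For illustration, a rough sketch of the token check this describes (the endpoint path and `userInfo` fields come from the text above; the function name, parameters, and error handling are assumptions, not the provider's actual code):

```python
import requests

def validate_token(api_server_url: str, token: str, ca_file: str | bool = True) -> dict:
    """Ask the Kubernetes API server who a token belongs to via SelfSubjectReview."""
    resp = requests.post(
        f"{api_server_url}/apis/authentication.k8s.io/v1/selfsubjectreviews",
        headers={"Authorization": f"Bearer {token}", "Content-Type": "application/json"},
        json={"apiVersion": "authentication.k8s.io/v1", "kind": "SelfSubjectReview"},
        verify=ca_file,   # corresponds to verify_tls / tls_cafile in the config above
        timeout=10,
    )
    resp.raise_for_status()
    user_info = resp.json()["status"]["userInfo"]
    # These fields are mapped onto access attributes via claims_mapping.
    return {
        "username": user_info.get("username"),
        "groups": user_info.get("groups", []),
        "uid": user_info.get("uid"),
    }
```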
To obtain a token for testing:
```bash
kubectl create namespace llama-stack
kubectl create serviceaccount llama-stack-auth -n llama-stack
kubectl create token llama-stack-auth -n llama-stack > llama-stack-auth-token
```

You can validate a request by running:
```bash
curl -s -L -H "Authorization: Bearer $(cat llama-stack-auth-token)" http://127.0.0.1:8321/v1/providers
```

#### GitHub Token Provider
Validates GitHub personal access tokens or OAuth tokens directly:
```yaml
@@ -1,156 +0,0 @@

# Llama Stack Benchmark Suite on Kubernetes
|
||||
|
||||
## Motivation
|
||||
|
||||
Performance benchmarking is critical for understanding the overhead and characteristics of the Llama Stack abstraction layer compared to direct inference engines like vLLM.
|
||||
|
||||
### Why This Benchmark Suite Exists
|
||||
|
||||
**Performance Validation**: The Llama Stack provides a unified API layer across multiple inference providers, but this abstraction introduces potential overhead. This benchmark suite quantifies the performance impact by comparing:
|
||||
- Llama Stack inference (with vLLM backend)
|
||||
- Direct vLLM inference calls
|
||||
- Both under identical Kubernetes deployment conditions
|
||||
|
||||
**Production Readiness Assessment**: Real-world deployments require understanding performance characteristics under load. This suite simulates concurrent user scenarios with configurable parameters (duration, concurrency, request patterns) to validate production readiness.
|
||||
|
||||
**Regression Detection (TODO)**: As the Llama Stack evolves, this benchmark provides automated regression detection for performance changes. CI/CD pipelines can leverage these benchmarks to catch performance degradations before production deployments.
|
||||
|
||||
**Resource Planning**: By measuring throughput, latency percentiles, and resource utilization patterns, teams can make informed decisions about:
|
||||
- Kubernetes resource allocation (CPU, memory, GPU)
|
||||
- Auto-scaling configurations
|
||||
- Cost optimization strategies
|
||||
|
||||
### Key Metrics Captured
|
||||
|
||||
The benchmark suite measures critical performance indicators:
|
||||
- **Throughput**: Requests per second under sustained load
|
||||
- **Latency Distribution**: P50, P95, P99 response times
|
||||
- **Time to First Token (TTFT)**: Critical for streaming applications
|
||||
- **Error Rates**: Request failures and timeout analysis
|
||||
|
||||
This data enables data-driven architectural decisions and performance optimization efforts.
|
||||
|
||||
## Setup
|
||||
|
||||
**1. Deploy base k8s infrastructure:**
|
||||
```bash
|
||||
cd ../k8s
|
||||
./apply.sh
|
||||
```
|
||||
|
||||
**2. Deploy benchmark components:**
|
||||
```bash
|
||||
cd ../k8s-benchmark
|
||||
./apply.sh
|
||||
```
|
||||
|
||||
**3. Verify deployment:**
|
||||
```bash
|
||||
kubectl get pods
|
||||
# Should see: llama-stack-benchmark-server, vllm-server, etc.
|
||||
```
|
||||
|
||||
## Quick Start
|
||||
|
||||
### Basic Benchmarks
|
||||
|
||||
**Benchmark Llama Stack (default):**
|
||||
```bash
|
||||
cd docs/source/distributions/k8s-benchmark/
|
||||
./run-benchmark.sh
|
||||
```
|
||||
|
||||
**Benchmark vLLM direct:**
|
||||
```bash
|
||||
./run-benchmark.sh --target vllm
|
||||
```
|
||||
|
||||
### Custom Configuration
|
||||
|
||||
**Extended benchmark with high concurrency:**
|
||||
```bash
|
||||
./run-benchmark.sh --target vllm --duration 120 --concurrent 20
|
||||
```
|
||||
|
||||
**Short test run:**
|
||||
```bash
|
||||
./run-benchmark.sh --target stack --duration 30 --concurrent 5
|
||||
```
|
||||
|
||||
## Command Reference
|
||||
|
||||
### run-benchmark.sh Options
|
||||
|
||||
```bash
|
||||
./run-benchmark.sh [options]
|
||||
|
||||
Options:
|
||||
-t, --target <stack|vllm> Target to benchmark (default: stack)
|
||||
-d, --duration <seconds> Duration in seconds (default: 60)
|
||||
-c, --concurrent <users> Number of concurrent users (default: 10)
|
||||
-h, --help Show help message
|
||||
|
||||
Examples:
|
||||
./run-benchmark.sh --target vllm # Benchmark vLLM direct
|
||||
./run-benchmark.sh --target stack # Benchmark Llama Stack
|
||||
./run-benchmark.sh -t vllm -d 120 -c 20 # vLLM with 120s, 20 users
|
||||
```
|
||||
|
||||
## Local Testing
|
||||
|
||||
### Running Benchmark Locally
|
||||
|
||||
For local development without Kubernetes:
|
||||
|
||||
**1. Start OpenAI mock server:**
|
||||
```bash
|
||||
uv run python openai-mock-server.py --port 8080
|
||||
```
|
||||
|
||||
**2. Run benchmark against mock server:**
|
||||
```bash
|
||||
uv run python benchmark.py \
|
||||
--base-url http://localhost:8080/v1 \
|
||||
--model mock-inference \
|
||||
--duration 30 \
|
||||
--concurrent 5
|
||||
```
|
||||
|
||||
**3. Test against local vLLM server:**
|
||||
```bash
|
||||
# If you have vLLM running locally on port 8000
|
||||
uv run python benchmark.py \
|
||||
--base-url http://localhost:8000/v1 \
|
||||
--model meta-llama/Llama-3.2-3B-Instruct \
|
||||
--duration 30 \
|
||||
--concurrent 5
|
||||
```
|
||||
|
||||
**4. Profile the running server:**
|
||||
```bash
|
||||
./profile_running_server.sh
|
||||
```
|
||||
|
||||
|
||||
|
||||
### OpenAI Mock Server
|
||||
|
||||
The `openai-mock-server.py` provides:
|
||||
- **OpenAI-compatible API** for testing without real models
|
||||
- **Configurable streaming delay** via `STREAM_DELAY_SECONDS` env var
|
||||
- **Consistent responses** for reproducible benchmarks
|
||||
- **Lightweight testing** without GPU requirements
|
||||
|
||||
**Mock server usage:**
|
||||
```bash
|
||||
uv run python openai-mock-server.py --port 8080
|
||||
```
|
||||
|
||||
The mock server is also deployed in k8s as `openai-mock-service:8080` and can be used by changing the Llama Stack configuration to use the `mock-vllm-inference` provider.
|
||||
|
||||
## Files in this Directory
|
||||
|
||||
- `benchmark.py` - Core benchmark script with async streaming support
|
||||
- `run-benchmark.sh` - Main script with target selection and configuration
|
||||
- `openai-mock-server.py` - Mock OpenAI API server for local testing
|
||||
- `README.md` - This documentation file
|
|
@ -1,36 +0,0 @@
|
|||
#!/usr/bin/env bash
|
||||
|
||||
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
||||
# All rights reserved.
|
||||
#
|
||||
# This source code is licensed under the terms described in the LICENSE file in
|
||||
# the root directory of this source tree.
|
||||
|
||||
# Deploys the benchmark-specific components on top of the base k8s deployment (../k8s/apply.sh).
|
||||
|
||||
export STREAM_DELAY_SECONDS=0.005
|
||||
|
||||
export POSTGRES_USER=llamastack
|
||||
export POSTGRES_DB=llamastack
|
||||
export POSTGRES_PASSWORD=llamastack
|
||||
|
||||
export INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct
|
||||
export SAFETY_MODEL=meta-llama/Llama-Guard-3-1B
|
||||
|
||||
export MOCK_INFERENCE_MODEL=mock-inference
|
||||
|
||||
export MOCK_INFERENCE_URL=openai-mock-service:8080
|
||||
|
||||
export BENCHMARK_INFERENCE_MODEL=$INFERENCE_MODEL
|
||||
|
||||
set -euo pipefail
|
||||
set -x
|
||||
|
||||
# Deploy benchmark-specific components
|
||||
kubectl create configmap llama-stack-config --from-file=stack_run_config.yaml \
|
||||
--dry-run=client -o yaml > stack-configmap.yaml
|
||||
|
||||
kubectl apply --validate=false -f stack-configmap.yaml
|
||||
|
||||
# Deploy our custom llama stack server (overriding the base one)
|
||||
envsubst < stack-k8s.yaml.template | kubectl apply --validate=false -f -
|
|
@ -1,267 +0,0 @@
|
|||
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
||||
# All rights reserved.
|
||||
#
|
||||
# This source code is licensed under the terms described in the LICENSE file in
|
||||
# the root directory of this source tree.
|
||||
|
||||
"""
|
||||
Simple benchmark script for Llama Stack with OpenAI API compatibility.
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import asyncio
|
||||
import os
|
||||
import random
|
||||
import statistics
|
||||
import time
|
||||
from typing import Tuple
|
||||
import aiohttp
|
||||
|
||||
|
||||
class BenchmarkStats:
|
||||
def __init__(self):
|
||||
self.response_times = []
|
||||
self.ttft_times = []
|
||||
self.chunks_received = []
|
||||
self.errors = []
|
||||
self.success_count = 0
|
||||
self.total_requests = 0
|
||||
self.concurrent_users = 0
|
||||
self.start_time = None
|
||||
self.end_time = None
|
||||
self._lock = asyncio.Lock()
|
||||
|
||||
async def add_result(self, response_time: float, chunks: int, ttft: float = None, error: str = None):
|
||||
async with self._lock:
|
||||
self.total_requests += 1
|
||||
if error:
|
||||
self.errors.append(error)
|
||||
else:
|
||||
self.success_count += 1
|
||||
self.response_times.append(response_time)
|
||||
self.chunks_received.append(chunks)
|
||||
if ttft is not None:
|
||||
self.ttft_times.append(ttft)
|
||||
|
||||
def print_summary(self):
|
||||
if not self.response_times:
|
||||
print("No successful requests to report")
|
||||
if self.errors:
|
||||
print(f"Total errors: {len(self.errors)}")
|
||||
print("First 5 errors:")
|
||||
for error in self.errors[:5]:
|
||||
print(f" {error}")
|
||||
return
|
||||
|
||||
total_time = self.end_time - self.start_time
|
||||
success_rate = (self.success_count / self.total_requests) * 100
|
||||
|
||||
print(f"\n{'='*60}")
|
||||
print(f"BENCHMARK RESULTS")
|
||||
print(f"{'='*60}")
|
||||
print(f"Total time: {total_time:.2f}s")
|
||||
print(f"Concurrent users: {self.concurrent_users}")
|
||||
print(f"Total requests: {self.total_requests}")
|
||||
print(f"Successful requests: {self.success_count}")
|
||||
print(f"Failed requests: {len(self.errors)}")
|
||||
print(f"Success rate: {success_rate:.1f}%")
|
||||
print(f"Requests per second: {self.success_count / total_time:.2f}")
|
||||
|
||||
print(f"\nResponse Time Statistics:")
|
||||
print(f" Mean: {statistics.mean(self.response_times):.3f}s")
|
||||
print(f" Median: {statistics.median(self.response_times):.3f}s")
|
||||
print(f" Min: {min(self.response_times):.3f}s")
|
||||
print(f" Max: {max(self.response_times):.3f}s")
|
||||
|
||||
if len(self.response_times) > 1:
|
||||
print(f" Std Dev: {statistics.stdev(self.response_times):.3f}s")
|
||||
|
||||
percentiles = [50, 90, 95, 99]
|
||||
sorted_times = sorted(self.response_times)
|
||||
print(f"\nPercentiles:")
|
||||
for p in percentiles:
|
||||
idx = int(len(sorted_times) * p / 100) - 1
|
||||
idx = max(0, min(idx, len(sorted_times) - 1))
|
||||
print(f" P{p}: {sorted_times[idx]:.3f}s")
|
||||
|
||||
if self.ttft_times:
|
||||
print(f"\nTime to First Token (TTFT) Statistics:")
|
||||
print(f" Mean: {statistics.mean(self.ttft_times):.3f}s")
|
||||
print(f" Median: {statistics.median(self.ttft_times):.3f}s")
|
||||
print(f" Min: {min(self.ttft_times):.3f}s")
|
||||
print(f" Max: {max(self.ttft_times):.3f}s")
|
||||
|
||||
if len(self.ttft_times) > 1:
|
||||
print(f" Std Dev: {statistics.stdev(self.ttft_times):.3f}s")
|
||||
|
||||
sorted_ttft = sorted(self.ttft_times)
|
||||
print(f"\nTTFT Percentiles:")
|
||||
for p in percentiles:
|
||||
idx = int(len(sorted_ttft) * p / 100) - 1
|
||||
idx = max(0, min(idx, len(sorted_ttft) - 1))
|
||||
print(f" P{p}: {sorted_ttft[idx]:.3f}s")
|
||||
|
||||
if self.chunks_received:
|
||||
print(f"\nStreaming Statistics:")
|
||||
print(f" Mean chunks per response: {statistics.mean(self.chunks_received):.1f}")
|
||||
print(f" Total chunks received: {sum(self.chunks_received)}")
|
||||
|
||||
if self.errors:
|
||||
print(f"\nErrors (showing first 5):")
|
||||
for error in self.errors[:5]:
|
||||
print(f" {error}")
|
||||
|
||||
|
||||
class LlamaStackBenchmark:
|
||||
def __init__(self, base_url: str, model_id: str):
|
||||
self.base_url = base_url.rstrip('/')
|
||||
self.model_id = model_id
|
||||
self.headers = {"Content-Type": "application/json"}
|
||||
self.test_messages = [
|
||||
[{"role": "user", "content": "Hi"}],
|
||||
[{"role": "user", "content": "What is the capital of France?"}],
|
||||
[{"role": "user", "content": "Explain quantum physics in simple terms."}],
|
||||
[{"role": "user", "content": "Write a short story about a robot learning to paint."}],
|
||||
[
|
||||
{"role": "user", "content": "What is machine learning?"},
|
||||
{"role": "assistant", "content": "Machine learning is a subset of AI..."},
|
||||
{"role": "user", "content": "Can you give me a practical example?"}
|
||||
]
|
||||
]
|
||||
|
||||
|
||||
async def make_async_streaming_request(self) -> Tuple[float, int, float | None, str | None]:
|
||||
"""Make a single async streaming chat completion request."""
|
||||
messages = random.choice(self.test_messages)
|
||||
payload = {
|
||||
"model": self.model_id,
|
||||
"messages": messages,
|
||||
"stream": True,
|
||||
"max_tokens": 100
|
||||
}
|
||||
|
||||
start_time = time.time()
|
||||
chunks_received = 0
|
||||
ttft = None
|
||||
error = None
|
||||
|
||||
session = aiohttp.ClientSession()
|
||||
|
||||
try:
|
||||
async with session.post(
|
||||
f"{self.base_url}/chat/completions",
|
||||
headers=self.headers,
|
||||
json=payload,
|
||||
timeout=aiohttp.ClientTimeout(total=30)
|
||||
) as response:
|
||||
if response.status == 200:
|
||||
async for line in response.content:
|
||||
if line:
|
||||
line_str = line.decode('utf-8').strip()
|
||||
if line_str.startswith('data: '):
|
||||
chunks_received += 1
|
||||
if ttft is None:
|
||||
ttft = time.time() - start_time
|
||||
if line_str == 'data: [DONE]':
|
||||
break
|
||||
|
||||
if chunks_received == 0:
|
||||
error = "No streaming chunks received"
|
||||
else:
|
||||
text = await response.text()
|
||||
error = f"HTTP {response.status}: {text[:100]}"
|
||||
|
||||
except Exception as e:
|
||||
error = f"Request error: {str(e)}"
|
||||
finally:
|
||||
await session.close()
|
||||
|
||||
response_time = time.time() - start_time
|
||||
return response_time, chunks_received, ttft, error
|
||||
|
||||
|
||||
async def run_benchmark(self, duration: int, concurrent_users: int) -> BenchmarkStats:
|
||||
"""Run benchmark using async requests for specified duration."""
|
||||
stats = BenchmarkStats()
|
||||
stats.concurrent_users = concurrent_users
|
||||
stats.start_time = time.time()
|
||||
|
||||
print(f"Starting benchmark: {duration}s duration, {concurrent_users} concurrent users")
|
||||
print(f"Target URL: {self.base_url}/chat/completions")
|
||||
print(f"Model: {self.model_id}")
|
||||
|
||||
connector = aiohttp.TCPConnector(limit=concurrent_users)
|
||||
async with aiohttp.ClientSession(connector=connector) as session:
|
||||
|
||||
async def worker(worker_id: int):
|
||||
"""Worker that sends requests sequentially until canceled."""
|
||||
request_count = 0
|
||||
while True:
|
||||
try:
|
||||
response_time, chunks, ttft, error = await self.make_async_streaming_request()
|
||||
await stats.add_result(response_time, chunks, ttft, error)
|
||||
request_count += 1
|
||||
|
||||
except asyncio.CancelledError:
|
||||
break
|
||||
except Exception as e:
|
||||
await stats.add_result(0, 0, None, f"Worker {worker_id} error: {str(e)}")
|
||||
|
||||
# Progress reporting task
|
||||
async def progress_reporter():
|
||||
last_report_time = time.time()
|
||||
while True:
|
||||
try:
|
||||
await asyncio.sleep(1) # Report every second
|
||||
if time.time() >= last_report_time + 10: # Report every 10 seconds
|
||||
elapsed = time.time() - stats.start_time
|
||||
print(f"Completed: {stats.total_requests} requests in {elapsed:.1f}s")
|
||||
last_report_time = time.time()
|
||||
except asyncio.CancelledError:
|
||||
break
|
||||
|
||||
# Spawn concurrent workers
|
||||
tasks = [asyncio.create_task(worker(i)) for i in range(concurrent_users)]
|
||||
progress_task = asyncio.create_task(progress_reporter())
|
||||
tasks.append(progress_task)
|
||||
|
||||
# Wait for duration then cancel all tasks
|
||||
await asyncio.sleep(duration)
|
||||
|
||||
for task in tasks:
|
||||
task.cancel()
|
||||
|
||||
# Wait for all tasks to complete
|
||||
await asyncio.gather(*tasks, return_exceptions=True)
|
||||
|
||||
stats.end_time = time.time()
|
||||
return stats
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(description="Llama Stack Benchmark Tool")
|
||||
parser.add_argument("--base-url", default=os.getenv("BENCHMARK_BASE_URL", "http://localhost:8000/v1/openai/v1"),
|
||||
help="Base URL for the API (default: http://localhost:8000/v1/openai/v1)")
|
||||
parser.add_argument("--model", default=os.getenv("INFERENCE_MODEL", "test-model"),
|
||||
help="Model ID to use for requests")
|
||||
parser.add_argument("--duration", type=int, default=60,
|
||||
help="Duration in seconds to run benchmark (default: 60)")
|
||||
parser.add_argument("--concurrent", type=int, default=10,
|
||||
help="Number of concurrent users (default: 10)")
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
benchmark = LlamaStackBenchmark(args.base_url, args.model)
|
||||
|
||||
try:
|
||||
stats = asyncio.run(benchmark.run_benchmark(args.duration, args.concurrent))
|
||||
stats.print_summary()
|
||||
|
||||
except KeyboardInterrupt:
|
||||
print("\nBenchmark interrupted by user")
|
||||
except Exception as e:
|
||||
print(f"Benchmark failed: {e}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
|
@ -1,190 +0,0 @@
|
|||
#!/usr/bin/env python3
|
||||
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
||||
# All rights reserved.
|
||||
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

"""
OpenAI-compatible mock server that returns:
- Hardcoded /models response for consistent validation
- Valid OpenAI-formatted chat completion responses with dynamic content
"""

from flask import Flask, request, jsonify, Response
import time
import random
import uuid
import json
import argparse
import os

app = Flask(__name__)

# Models from environment variables
def get_models():
    models_str = os.getenv("MOCK_MODELS", "meta-llama/Llama-3.2-3B-Instruct")
    model_ids = [m.strip() for m in models_str.split(",") if m.strip()]

    return {
        "object": "list",
        "data": [
            {
                "id": model_id,
                "object": "model",
                "created": 1234567890,
                "owned_by": "vllm"
            }
            for model_id in model_ids
        ]
    }

def generate_random_text(length=50):
    """Generate random but coherent text for responses."""
    words = [
        "Hello", "there", "I'm", "an", "AI", "assistant", "ready", "to", "help", "you",
        "with", "your", "questions", "and", "tasks", "today", "Let", "me", "know", "what",
        "you'd", "like", "to", "discuss", "or", "explore", "together", "I", "can", "assist",
        "with", "various", "topics", "including", "coding", "writing", "analysis", "and", "more"
    ]
    return " ".join(random.choices(words, k=length))

@app.route('/v1/models', methods=['GET'])
def list_models():
    models = get_models()
    print(f"[MOCK] Returning models: {[m['id'] for m in models['data']]}")
    return jsonify(models)

@app.route('/v1/chat/completions', methods=['POST'])
def chat_completions():
    """Return OpenAI-formatted chat completion responses."""
    data = request.get_json()
    default_model = get_models()['data'][0]['id']
    model = data.get('model', default_model)
    messages = data.get('messages', [])
    stream = data.get('stream', False)

    print(f"[MOCK] Chat completion request - model: {model}, stream: {stream}")

    if stream:
        return handle_streaming_completion(model, messages)
    else:
        return handle_non_streaming_completion(model, messages)

def handle_non_streaming_completion(model, messages):
    response_text = generate_random_text(random.randint(20, 80))

    # Calculate realistic token counts
    prompt_tokens = sum(len(str(msg.get('content', '')).split()) for msg in messages)
    completion_tokens = len(response_text.split())

    response = {
        "id": f"chatcmpl-{uuid.uuid4().hex[:8]}",
        "object": "chat.completion",
        "created": int(time.time()),
        "model": model,
        "choices": [
            {
                "index": 0,
                "message": {
                    "role": "assistant",
                    "content": response_text
                },
                "finish_reason": "stop"
            }
        ],
        "usage": {
            "prompt_tokens": prompt_tokens,
            "completion_tokens": completion_tokens,
            "total_tokens": prompt_tokens + completion_tokens
        }
    }

    return jsonify(response)

def handle_streaming_completion(model, messages):
    def generate_stream():
        # Generate response text
        full_response = generate_random_text(random.randint(30, 100))
        words = full_response.split()

        # Send initial chunk
        initial_chunk = {
            "id": f"chatcmpl-{uuid.uuid4().hex[:8]}",
            "object": "chat.completion.chunk",
            "created": int(time.time()),
            "model": model,
            "choices": [
                {
                    "index": 0,
                    "delta": {"role": "assistant", "content": ""}
                }
            ]
        }
        yield f"data: {json.dumps(initial_chunk)}\n\n"

        # Send word by word
        for i, word in enumerate(words):
            chunk = {
                "id": f"chatcmpl-{uuid.uuid4().hex[:8]}",
                "object": "chat.completion.chunk",
                "created": int(time.time()),
                "model": model,
                "choices": [
                    {
                        "index": 0,
                        "delta": {"content": f"{word} " if i < len(words) - 1 else word}
                    }
                ]
            }
            yield f"data: {json.dumps(chunk)}\n\n"
            # Configurable delay to simulate realistic streaming
            stream_delay = float(os.getenv("STREAM_DELAY_SECONDS", "0.005"))
            time.sleep(stream_delay)

        # Send final chunk
        final_chunk = {
            "id": f"chatcmpl-{uuid.uuid4().hex[:8]}",
            "object": "chat.completion.chunk",
            "created": int(time.time()),
            "model": model,
            "choices": [
                {
                    "index": 0,
                    "delta": {"content": ""},
                    "finish_reason": "stop"
                }
            ]
        }
        yield f"data: {json.dumps(final_chunk)}\n\n"
        yield "data: [DONE]\n\n"

    return Response(
        generate_stream(),
        mimetype='text/event-stream',
        headers={
            'Cache-Control': 'no-cache',
            'Connection': 'keep-alive',
            'Access-Control-Allow-Origin': '*',
        }
    )

@app.route('/health', methods=['GET'])
def health():
    return jsonify({"status": "healthy", "type": "openai-mock"})

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='OpenAI-compatible mock server')
    parser.add_argument('--port', type=int, default=8081,
                        help='Port to run the server on (default: 8081)')
    args = parser.parse_args()

    port = args.port

    models = get_models()
    print("Starting OpenAI-compatible mock server...")
    print(f"- /models endpoint with: {[m['id'] for m in models['data']]}")
    print("- OpenAI-formatted chat/completion responses with dynamic content")
    print("- Streaming support with valid SSE format")
    print(f"- Listening on: http://0.0.0.0:{port}")
    app.run(host='0.0.0.0', port=port, debug=False)
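The mock server above only exposes the `/v1/models`, `/v1/chat/completions`, and `/health` routes, so a quick way to sanity-check it is to drive those endpoints from a small client. The sketch below is not part of the change set: it assumes the file is saved as `openai-mock-server.py`, started locally on its default port 8081, and that the `requests` package is installed.

```python
# Minimal smoke test for the mock server (hypothetical helper, not part of this PR).
# Assumes `python openai-mock-server.py --port 8081` is already running locally.
import json
import requests

BASE = "http://localhost:8081"

# Models come from the MOCK_MODELS env var of the server process.
models = requests.get(f"{BASE}/v1/models").json()
print("models:", [m["id"] for m in models["data"]])

# Non-streaming completion: a single OpenAI-shaped JSON body.
resp = requests.post(
    f"{BASE}/v1/chat/completions",
    json={
        "model": models["data"][0]["id"],
        "messages": [{"role": "user", "content": "Hello"}],
        "stream": False,
    },
)
print(resp.json()["choices"][0]["message"]["content"])

# Streaming completion: consume the SSE lines until the [DONE] sentinel.
with requests.post(
    f"{BASE}/v1/chat/completions",
    json={
        "model": models["data"][0]["id"],
        "messages": [{"role": "user", "content": "Hello"}],
        "stream": True,
    },
    stream=True,
) as streamed:
    for line in streamed.iter_lines():
        if not line.startswith(b"data: "):
            continue
        payload = line[len(b"data: "):]
        if payload == b"[DONE]":
            break
        delta = json.loads(payload)["choices"][0]["delta"]
        print(delta.get("content", ""), end="", flush=True)
print()
```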
@@ -1,52 +0,0 @@
#!/bin/bash

# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

# Script to profile an already running Llama Stack server
# Usage: ./profile_running_server.sh [duration_seconds] [output_file]

DURATION=${1:-60}  # Default 60 seconds
OUTPUT_FILE=${2:-"llama_stack_profile"}  # Default output file

echo "Looking for running Llama Stack server..."

# Find the server PID
SERVER_PID=$(ps aux | grep "llama_stack.core.server.server" | grep -v grep | awk '{print $2}' | head -1)

if [ -z "$SERVER_PID" ]; then
    echo "Error: No running Llama Stack server found"
    echo "Please start your server first with:"
    echo "LLAMA_STACK_LOGGING=\"all=ERROR\" MOCK_INFERENCE_URL=http://localhost:8080 SAFETY_MODEL=llama-guard3:1b uv run --with llama-stack python -m llama_stack.core.server.server docs/source/distributions/k8s-benchmark/stack_run_config.yaml"
    exit 1
fi

echo "Found Llama Stack server with PID: $SERVER_PID"

# Start py-spy profiling
echo "Starting py-spy profiling for ${DURATION} seconds..."
echo "Output will be saved to: ${OUTPUT_FILE}.svg"
echo ""
echo "You can now run your load test..."
echo ""

# Get the full path to py-spy
PYSPY_PATH=$(which py-spy)

# Check if running as root, if not, use sudo
if [ "$EUID" -ne 0 ]; then
    echo "py-spy requires root permissions on macOS. Running with sudo..."
    sudo "$PYSPY_PATH" record -o "${OUTPUT_FILE}.svg" -d ${DURATION} -p $SERVER_PID
else
    "$PYSPY_PATH" record -o "${OUTPUT_FILE}.svg" -d ${DURATION} -p $SERVER_PID
fi

echo ""
echo "Profiling completed! Results saved to: ${OUTPUT_FILE}.svg"
echo ""
echo "To view the flame graph:"
echo "open ${OUTPUT_FILE}.svg"
@@ -1,148 +0,0 @@
#!/usr/bin/env bash

# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

set -euo pipefail

# Default values
TARGET="stack"
DURATION=60
CONCURRENT=10

# Parse command line arguments
usage() {
    echo "Usage: $0 [options]"
    echo "Options:"
    echo "  -t, --target <stack|vllm>   Target to benchmark (default: stack)"
    echo "  -d, --duration <seconds>    Duration in seconds (default: 60)"
    echo "  -c, --concurrent <users>    Number of concurrent users (default: 10)"
    echo "  -h, --help                  Show this help message"
    echo ""
    echo "Examples:"
    echo "  $0 --target vllm        # Benchmark vLLM direct"
    echo "  $0 --target stack       # Benchmark Llama Stack (default)"
    echo "  $0 -t vllm -d 120 -c 20 # vLLM with 120s duration, 20 users"
}

while [[ $# -gt 0 ]]; do
    case $1 in
        -t|--target)
            TARGET="$2"
            shift 2
            ;;
        -d|--duration)
            DURATION="$2"
            shift 2
            ;;
        -c|--concurrent)
            CONCURRENT="$2"
            shift 2
            ;;
        -h|--help)
            usage
            exit 0
            ;;
        *)
            echo "Unknown option: $1"
            usage
            exit 1
            ;;
    esac
done

# Validate target
if [[ "$TARGET" != "stack" && "$TARGET" != "vllm" ]]; then
    echo "Error: Target must be 'stack' or 'vllm'"
    usage
    exit 1
fi

# Set configuration based on target
if [[ "$TARGET" == "vllm" ]]; then
    BASE_URL="http://vllm-server:8000/v1"
    JOB_NAME="vllm-benchmark-job"
    echo "Benchmarking vLLM direct..."
else
    BASE_URL="http://llama-stack-benchmark-service:8323/v1/openai/v1"
    JOB_NAME="stack-benchmark-job"
    echo "Benchmarking Llama Stack..."
fi

echo "Configuration:"
echo "  Target: $TARGET"
echo "  Base URL: $BASE_URL"
echo "  Duration: ${DURATION}s"
echo "  Concurrent users: $CONCURRENT"
echo ""

# Create temporary job yaml
TEMP_YAML="/tmp/benchmark-job-temp-$(date +%s).yaml"
cat > "$TEMP_YAML" << EOF
apiVersion: batch/v1
kind: Job
metadata:
  name: $JOB_NAME
  namespace: default
spec:
  template:
    spec:
      containers:
      - name: benchmark
        image: python:3.11-slim
        command: ["/bin/bash"]
        args:
        - "-c"
        - |
          pip install aiohttp &&
          python3 /benchmark/benchmark.py \\
            --base-url $BASE_URL \\
            --model \${INFERENCE_MODEL} \\
            --duration $DURATION \\
            --concurrent $CONCURRENT
        env:
        - name: INFERENCE_MODEL
          value: "meta-llama/Llama-3.2-3B-Instruct"
        volumeMounts:
        - name: benchmark-script
          mountPath: /benchmark
        resources:
          requests:
            memory: "256Mi"
            cpu: "250m"
          limits:
            memory: "512Mi"
            cpu: "500m"
      volumes:
      - name: benchmark-script
        configMap:
          name: benchmark-script
      restartPolicy: Never
  backoffLimit: 3
EOF

echo "Creating benchmark ConfigMap..."
kubectl create configmap benchmark-script \
  --from-file=benchmark.py=benchmark.py \
  --dry-run=client -o yaml | kubectl apply -f -

echo "Cleaning up any existing benchmark job..."
kubectl delete job $JOB_NAME 2>/dev/null || true

echo "Deploying benchmark Job..."
kubectl apply -f "$TEMP_YAML"

echo "Waiting for job to start..."
kubectl wait --for=condition=Ready pod -l job-name=$JOB_NAME --timeout=60s

echo "Following benchmark logs..."
kubectl logs -f job/$JOB_NAME

echo "Job completed. Checking final status..."
kubectl get job $JOB_NAME

# Clean up temporary file
rm -f "$TEMP_YAML"
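The Job above simply runs `benchmark.py` (mounted from the `benchmark-script` ConfigMap) with a base URL, model, duration, and concurrency. That script is not part of this hunk; as a rough illustration of what such a load loop looks like, here is a hedged sketch built around the same flags and the `aiohttp` dependency the Job installs. The payload and reporting format are assumptions, not the actual benchmark.

```python
# Hypothetical sketch of a concurrent load loop in the spirit of benchmark.py
# (assumed flags: --base-url, --model, --duration, --concurrent).
import argparse
import asyncio
import time

import aiohttp

async def worker(session: aiohttp.ClientSession, base_url: str, model: str,
                 stop_at: float, stats: dict) -> None:
    payload = {
        "model": model,
        "messages": [{"role": "user", "content": "Hello"}],
        "stream": False,
    }
    while time.time() < stop_at:
        try:
            async with session.post(f"{base_url}/chat/completions", json=payload) as resp:
                await resp.json()
                stats["ok" if resp.status == 200 else "err"] += 1
        except aiohttp.ClientError:
            stats["err"] += 1

async def main() -> None:
    parser = argparse.ArgumentParser()
    parser.add_argument("--base-url", required=True)
    parser.add_argument("--model", required=True)
    parser.add_argument("--duration", type=int, default=60)
    parser.add_argument("--concurrent", type=int, default=10)
    args = parser.parse_args()

    stats = {"ok": 0, "err": 0}
    stop_at = time.time() + args.duration
    async with aiohttp.ClientSession() as session:
        await asyncio.gather(*(
            worker(session, args.base_url, args.model, stop_at, stats)
            for _ in range(args.concurrent)
        ))
    print(f"completed={stats['ok']} errors={stats['err']} "
          f"rps={stats['ok'] / args.duration:.2f}")

if __name__ == "__main__":
    asyncio.run(main())
```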
@@ -1,133 +0,0 @@
apiVersion: v1
data:
  stack_run_config.yaml: |
    version: '2'
    image_name: kubernetes-benchmark-demo
    apis:
    - agents
    - inference
    - safety
    - telemetry
    - tool_runtime
    - vector_io
    providers:
      inference:
      - provider_id: vllm-inference
        provider_type: remote::vllm
        config:
          url: ${env.VLLM_URL:=http://localhost:8000/v1}
          max_tokens: ${env.VLLM_MAX_TOKENS:=4096}
          api_token: ${env.VLLM_API_TOKEN:=fake}
          tls_verify: ${env.VLLM_TLS_VERIFY:=true}
      - provider_id: vllm-safety
        provider_type: remote::vllm
        config:
          url: ${env.VLLM_SAFETY_URL:=http://localhost:8000/v1}
          max_tokens: ${env.VLLM_MAX_TOKENS:=4096}
          api_token: ${env.VLLM_API_TOKEN:=fake}
          tls_verify: ${env.VLLM_TLS_VERIFY:=true}
      - provider_id: sentence-transformers
        provider_type: inline::sentence-transformers
        config: {}
      vector_io:
      - provider_id: ${env.ENABLE_CHROMADB:+chromadb}
        provider_type: remote::chromadb
        config:
          url: ${env.CHROMADB_URL:=}
          kvstore:
            type: postgres
            host: ${env.POSTGRES_HOST:=localhost}
            port: ${env.POSTGRES_PORT:=5432}
            db: ${env.POSTGRES_DB:=llamastack}
            user: ${env.POSTGRES_USER:=llamastack}
            password: ${env.POSTGRES_PASSWORD:=llamastack}
      safety:
      - provider_id: llama-guard
        provider_type: inline::llama-guard
        config:
          excluded_categories: []
      agents:
      - provider_id: meta-reference
        provider_type: inline::meta-reference
        config:
          persistence_store:
            type: postgres
            host: ${env.POSTGRES_HOST:=localhost}
            port: ${env.POSTGRES_PORT:=5432}
            db: ${env.POSTGRES_DB:=llamastack}
            user: ${env.POSTGRES_USER:=llamastack}
            password: ${env.POSTGRES_PASSWORD:=llamastack}
          responses_store:
            type: postgres
            host: ${env.POSTGRES_HOST:=localhost}
            port: ${env.POSTGRES_PORT:=5432}
            db: ${env.POSTGRES_DB:=llamastack}
            user: ${env.POSTGRES_USER:=llamastack}
            password: ${env.POSTGRES_PASSWORD:=llamastack}
      telemetry:
      - provider_id: meta-reference
        provider_type: inline::meta-reference
        config:
          service_name: "${env.OTEL_SERVICE_NAME:=\u200B}"
          sinks: ${env.TELEMETRY_SINKS:=console}
      tool_runtime:
      - provider_id: brave-search
        provider_type: remote::brave-search
        config:
          api_key: ${env.BRAVE_SEARCH_API_KEY:+}
          max_results: 3
      - provider_id: tavily-search
        provider_type: remote::tavily-search
        config:
          api_key: ${env.TAVILY_SEARCH_API_KEY:+}
          max_results: 3
      - provider_id: rag-runtime
        provider_type: inline::rag-runtime
        config: {}
      - provider_id: model-context-protocol
        provider_type: remote::model-context-protocol
        config: {}
    metadata_store:
      type: postgres
      host: ${env.POSTGRES_HOST:=localhost}
      port: ${env.POSTGRES_PORT:=5432}
      db: ${env.POSTGRES_DB:=llamastack}
      user: ${env.POSTGRES_USER:=llamastack}
      password: ${env.POSTGRES_PASSWORD:=llamastack}
      table_name: llamastack_kvstore
    inference_store:
      type: postgres
      host: ${env.POSTGRES_HOST:=localhost}
      port: ${env.POSTGRES_PORT:=5432}
      db: ${env.POSTGRES_DB:=llamastack}
      user: ${env.POSTGRES_USER:=llamastack}
      password: ${env.POSTGRES_PASSWORD:=llamastack}
    models:
    - metadata:
        embedding_dimension: 384
      model_id: all-MiniLM-L6-v2
      provider_id: sentence-transformers
      model_type: embedding
    - model_id: ${env.INFERENCE_MODEL}
      provider_id: vllm-inference
      model_type: llm
    - model_id: ${env.SAFETY_MODEL}
      provider_id: vllm-safety
      model_type: llm
    shields:
    - shield_id: ${env.SAFETY_MODEL:=meta-llama/Llama-Guard-3-1B}
    vector_dbs: []
    datasets: []
    scoring_fns: []
    benchmarks: []
    tool_groups:
    - toolgroup_id: builtin::websearch
      provider_id: tavily-search
    - toolgroup_id: builtin::rag
      provider_id: rag-runtime
    server:
      port: 8323
kind: ConfigMap
metadata:
  creationTimestamp: null
  name: llama-stack-config
@@ -1,83 +0,0 @@
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: llama-benchmark-pvc
spec:
  accessModes:
    - ReadWriteOnce
  resources:
    requests:
      storage: 1Gi
---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: llama-stack-benchmark-server
spec:
  replicas: 1
  selector:
    matchLabels:
      app.kubernetes.io/name: llama-stack-benchmark
      app.kubernetes.io/component: server
  template:
    metadata:
      labels:
        app.kubernetes.io/name: llama-stack-benchmark
        app.kubernetes.io/component: server
    spec:
      containers:
      - name: llama-stack-benchmark
        image: llamastack/distribution-starter:latest
        imagePullPolicy: Always # since we have specified latest instead of a version
        env:
        - name: ENABLE_CHROMADB
          value: "true"
        - name: CHROMADB_URL
          value: http://chromadb.default.svc.cluster.local:6000
        - name: POSTGRES_HOST
          value: postgres-server.default.svc.cluster.local
        - name: POSTGRES_PORT
          value: "5432"
        - name: INFERENCE_MODEL
          value: "${INFERENCE_MODEL}"
        - name: SAFETY_MODEL
          value: "${SAFETY_MODEL}"
        - name: TAVILY_SEARCH_API_KEY
          value: "${TAVILY_SEARCH_API_KEY}"
        - name: VLLM_URL
          value: http://vllm-server.default.svc.cluster.local:8000/v1
        - name: VLLM_MAX_TOKENS
          value: "3072"
        - name: VLLM_SAFETY_URL
          value: http://vllm-server-safety.default.svc.cluster.local:8001/v1
        - name: VLLM_TLS_VERIFY
          value: "false"
        command: ["python", "-m", "llama_stack.core.server.server", "/etc/config/stack_run_config.yaml", "--port", "8323"]
        ports:
          - containerPort: 8323
        volumeMounts:
          - name: llama-storage
            mountPath: /root/.llama
          - name: llama-config
            mountPath: /etc/config
      volumes:
      - name: llama-storage
        persistentVolumeClaim:
          claimName: llama-benchmark-pvc
      - name: llama-config
        configMap:
          name: llama-stack-config
---
apiVersion: v1
kind: Service
metadata:
  name: llama-stack-benchmark-service
spec:
  selector:
    app.kubernetes.io/name: llama-stack-benchmark
    app.kubernetes.io/component: server
  ports:
    - name: http
      port: 8323
      targetPort: 8323
  type: ClusterIP
@@ -1,108 +0,0 @@
version: '2'
image_name: kubernetes-benchmark-demo
apis:
- agents
- inference
- telemetry
- tool_runtime
- vector_io
providers:
  inference:
  - provider_id: vllm-inference
    provider_type: remote::vllm
    config:
      url: ${env.VLLM_URL:=http://localhost:8000/v1}
      max_tokens: ${env.VLLM_MAX_TOKENS:=4096}
      api_token: ${env.VLLM_API_TOKEN:=fake}
      tls_verify: ${env.VLLM_TLS_VERIFY:=true}
  - provider_id: sentence-transformers
    provider_type: inline::sentence-transformers
    config: {}
  vector_io:
  - provider_id: ${env.ENABLE_CHROMADB:+chromadb}
    provider_type: remote::chromadb
    config:
      url: ${env.CHROMADB_URL:=}
      kvstore:
        type: postgres
        host: ${env.POSTGRES_HOST:=localhost}
        port: ${env.POSTGRES_PORT:=5432}
        db: ${env.POSTGRES_DB:=llamastack}
        user: ${env.POSTGRES_USER:=llamastack}
        password: ${env.POSTGRES_PASSWORD:=llamastack}
  agents:
  - provider_id: meta-reference
    provider_type: inline::meta-reference
    config:
      persistence_store:
        type: postgres
        host: ${env.POSTGRES_HOST:=localhost}
        port: ${env.POSTGRES_PORT:=5432}
        db: ${env.POSTGRES_DB:=llamastack}
        user: ${env.POSTGRES_USER:=llamastack}
        password: ${env.POSTGRES_PASSWORD:=llamastack}
      responses_store:
        type: postgres
        host: ${env.POSTGRES_HOST:=localhost}
        port: ${env.POSTGRES_PORT:=5432}
        db: ${env.POSTGRES_DB:=llamastack}
        user: ${env.POSTGRES_USER:=llamastack}
        password: ${env.POSTGRES_PASSWORD:=llamastack}
  telemetry:
  - provider_id: meta-reference
    provider_type: inline::meta-reference
    config:
      service_name: "${env.OTEL_SERVICE_NAME:=\u200B}"
      sinks: ${env.TELEMETRY_SINKS:=console}
  tool_runtime:
  - provider_id: brave-search
    provider_type: remote::brave-search
    config:
      api_key: ${env.BRAVE_SEARCH_API_KEY:+}
      max_results: 3
  - provider_id: tavily-search
    provider_type: remote::tavily-search
    config:
      api_key: ${env.TAVILY_SEARCH_API_KEY:+}
      max_results: 3
  - provider_id: rag-runtime
    provider_type: inline::rag-runtime
    config: {}
  - provider_id: model-context-protocol
    provider_type: remote::model-context-protocol
    config: {}
metadata_store:
  type: postgres
  host: ${env.POSTGRES_HOST:=localhost}
  port: ${env.POSTGRES_PORT:=5432}
  db: ${env.POSTGRES_DB:=llamastack}
  user: ${env.POSTGRES_USER:=llamastack}
  password: ${env.POSTGRES_PASSWORD:=llamastack}
  table_name: llamastack_kvstore
inference_store:
  type: postgres
  host: ${env.POSTGRES_HOST:=localhost}
  port: ${env.POSTGRES_PORT:=5432}
  db: ${env.POSTGRES_DB:=llamastack}
  user: ${env.POSTGRES_USER:=llamastack}
  password: ${env.POSTGRES_PASSWORD:=llamastack}
models:
- metadata:
    embedding_dimension: 384
  model_id: all-MiniLM-L6-v2
  provider_id: sentence-transformers
  model_type: embedding
- model_id: ${env.INFERENCE_MODEL}
  provider_id: vllm-inference
  model_type: llm
vector_dbs: []
datasets: []
scoring_fns: []
benchmarks: []
tool_groups:
- toolgroup_id: builtin::websearch
  provider_id: tavily-search
- toolgroup_id: builtin::rag
  provider_id: rag-runtime
server:
  port: 8323
@@ -1,137 +1,55 @@
apiVersion: v1
data:
  stack_run_config.yaml: |
    version: '2'
    image_name: kubernetes-demo
    apis:
    - agents
    - inference
    - safety
    - telemetry
    - tool_runtime
    - vector_io
    providers:
      inference:
      - provider_id: vllm-inference
        provider_type: remote::vllm
        config:
          url: ${env.VLLM_URL:=http://localhost:8000/v1}
          max_tokens: ${env.VLLM_MAX_TOKENS:=4096}
          api_token: ${env.VLLM_API_TOKEN:=fake}
          tls_verify: ${env.VLLM_TLS_VERIFY:=true}
      - provider_id: vllm-safety
        provider_type: remote::vllm
        config:
          url: ${env.VLLM_SAFETY_URL:=http://localhost:8000/v1}
          max_tokens: ${env.VLLM_MAX_TOKENS:=4096}
          api_token: ${env.VLLM_API_TOKEN:=fake}
          tls_verify: ${env.VLLM_TLS_VERIFY:=true}
      - provider_id: sentence-transformers
        provider_type: inline::sentence-transformers
        config: {}
      vector_io:
      - provider_id: ${env.ENABLE_CHROMADB:+chromadb}
        provider_type: remote::chromadb
        config:
          url: ${env.CHROMADB_URL:=}
          kvstore:
            type: postgres
            host: ${env.POSTGRES_HOST:=localhost}
            port: ${env.POSTGRES_PORT:=5432}
            db: ${env.POSTGRES_DB:=llamastack}
            user: ${env.POSTGRES_USER:=llamastack}
            password: ${env.POSTGRES_PASSWORD:=llamastack}
      safety:
      - provider_id: llama-guard
        provider_type: inline::llama-guard
        config:
          excluded_categories: []
      agents:
      - provider_id: meta-reference
        provider_type: inline::meta-reference
        config:
          persistence_store:
            type: postgres
            host: ${env.POSTGRES_HOST:=localhost}
            port: ${env.POSTGRES_PORT:=5432}
            db: ${env.POSTGRES_DB:=llamastack}
            user: ${env.POSTGRES_USER:=llamastack}
            password: ${env.POSTGRES_PASSWORD:=llamastack}
          responses_store:
            type: postgres
            host: ${env.POSTGRES_HOST:=localhost}
            port: ${env.POSTGRES_PORT:=5432}
            db: ${env.POSTGRES_DB:=llamastack}
            user: ${env.POSTGRES_USER:=llamastack}
            password: ${env.POSTGRES_PASSWORD:=llamastack}
      telemetry:
      - provider_id: meta-reference
        provider_type: inline::meta-reference
        config:
          service_name: "${env.OTEL_SERVICE_NAME:=\u200B}"
          sinks: ${env.TELEMETRY_SINKS:=console}
      tool_runtime:
      - provider_id: brave-search
        provider_type: remote::brave-search
        config:
          api_key: ${env.BRAVE_SEARCH_API_KEY:+}
          max_results: 3
      - provider_id: tavily-search
        provider_type: remote::tavily-search
        config:
          api_key: ${env.TAVILY_SEARCH_API_KEY:+}
          max_results: 3
      - provider_id: rag-runtime
        provider_type: inline::rag-runtime
        config: {}
      - provider_id: model-context-protocol
        provider_type: remote::model-context-protocol
        config: {}
    metadata_store:
      type: postgres
      host: ${env.POSTGRES_HOST:=localhost}
      port: ${env.POSTGRES_PORT:=5432}
      db: ${env.POSTGRES_DB:=llamastack}
      user: ${env.POSTGRES_USER:=llamastack}
      password: ${env.POSTGRES_PASSWORD:=llamastack}
      table_name: llamastack_kvstore
    inference_store:
      type: postgres
      host: ${env.POSTGRES_HOST:=localhost}
      port: ${env.POSTGRES_PORT:=5432}
      db: ${env.POSTGRES_DB:=llamastack}
      user: ${env.POSTGRES_USER:=llamastack}
      password: ${env.POSTGRES_PASSWORD:=llamastack}
    models:
    - metadata:
        embedding_dimension: 384
      model_id: all-MiniLM-L6-v2
      provider_id: sentence-transformers
      model_type: embedding
    - metadata: {}
      model_id: ${env.INFERENCE_MODEL}
      provider_id: vllm-inference
      model_type: llm
    - metadata: {}
      model_id: ${env.SAFETY_MODEL:=meta-llama/Llama-Guard-3-1B}
      provider_id: vllm-safety
      model_type: llm
    shields:
    - shield_id: ${env.SAFETY_MODEL:=meta-llama/Llama-Guard-3-1B}
    vector_dbs: []
    datasets: []
    scoring_fns: []
    benchmarks: []
    tool_groups:
    - toolgroup_id: builtin::websearch
      provider_id: tavily-search
    - toolgroup_id: builtin::rag
      provider_id: rag-runtime
    server:
      port: 8321
      auth:
        provider_config:
          type: github_token
  stack_run_config.yaml: "version: '2'\nimage_name: kubernetes-demo\napis:\n- agents\n-
    inference\n- files\n- safety\n- telemetry\n- tool_runtime\n- vector_io\nproviders:\n
    \ inference:\n - provider_id: vllm-inference\n provider_type: remote::vllm\n
    \ config:\n url: ${env.VLLM_URL:=http://localhost:8000/v1}\n max_tokens:
    ${env.VLLM_MAX_TOKENS:=4096}\n api_token: ${env.VLLM_API_TOKEN:=fake}\n tls_verify:
    ${env.VLLM_TLS_VERIFY:=true}\n - provider_id: vllm-safety\n provider_type:
    remote::vllm\n config:\n url: ${env.VLLM_SAFETY_URL:=http://localhost:8000/v1}\n
    \ max_tokens: ${env.VLLM_MAX_TOKENS:=4096}\n api_token: ${env.VLLM_API_TOKEN:=fake}\n
    \ tls_verify: ${env.VLLM_TLS_VERIFY:=true}\n - provider_id: sentence-transformers\n
    \ provider_type: inline::sentence-transformers\n config: {}\n vector_io:\n
    \ - provider_id: ${env.ENABLE_CHROMADB:+chromadb}\n provider_type: remote::chromadb\n
    \ config:\n url: ${env.CHROMADB_URL:=}\n kvstore:\n type: postgres\n
    \ host: ${env.POSTGRES_HOST:=localhost}\n port: ${env.POSTGRES_PORT:=5432}\n
    \ db: ${env.POSTGRES_DB:=llamastack}\n user: ${env.POSTGRES_USER:=llamastack}\n
    \ password: ${env.POSTGRES_PASSWORD:=llamastack}\n files:\n - provider_id:
    meta-reference-files\n provider_type: inline::localfs\n config:\n storage_dir:
    ${env.FILES_STORAGE_DIR:=~/.llama/distributions/starter/files}\n metadata_store:\n
    \ type: sqlite\n db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter}/files_metadata.db
    \ \n safety:\n - provider_id: llama-guard\n provider_type: inline::llama-guard\n
    \ config:\n excluded_categories: []\n agents:\n - provider_id: meta-reference\n
    \ provider_type: inline::meta-reference\n config:\n persistence_store:\n
    \ type: postgres\n host: ${env.POSTGRES_HOST:=localhost}\n port:
    ${env.POSTGRES_PORT:=5432}\n db: ${env.POSTGRES_DB:=llamastack}\n user:
    ${env.POSTGRES_USER:=llamastack}\n password: ${env.POSTGRES_PASSWORD:=llamastack}\n
    \ responses_store:\n type: postgres\n host: ${env.POSTGRES_HOST:=localhost}\n
    \ port: ${env.POSTGRES_PORT:=5432}\n db: ${env.POSTGRES_DB:=llamastack}\n
    \ user: ${env.POSTGRES_USER:=llamastack}\n password: ${env.POSTGRES_PASSWORD:=llamastack}\n
    \ telemetry:\n - provider_id: meta-reference\n provider_type: inline::meta-reference\n
    \ config:\n service_name: \"${env.OTEL_SERVICE_NAME:=\\u200B}\"\n sinks:
    ${env.TELEMETRY_SINKS:=console}\n tool_runtime:\n - provider_id: brave-search\n
    \ provider_type: remote::brave-search\n config:\n api_key: ${env.BRAVE_SEARCH_API_KEY:+}\n
    \ max_results: 3\n - provider_id: tavily-search\n provider_type: remote::tavily-search\n
    \ config:\n api_key: ${env.TAVILY_SEARCH_API_KEY:+}\n max_results:
    3\n - provider_id: rag-runtime\n provider_type: inline::rag-runtime\n config:
    {}\n - provider_id: model-context-protocol\n provider_type: remote::model-context-protocol\n
    \ config: {}\nmetadata_store:\n type: postgres\n host: ${env.POSTGRES_HOST:=localhost}\n
    \ port: ${env.POSTGRES_PORT:=5432}\n db: ${env.POSTGRES_DB:=llamastack}\n user:
    ${env.POSTGRES_USER:=llamastack}\n password: ${env.POSTGRES_PASSWORD:=llamastack}\n
    \ table_name: llamastack_kvstore\ninference_store:\n type: postgres\n host:
    ${env.POSTGRES_HOST:=localhost}\n port: ${env.POSTGRES_PORT:=5432}\n db: ${env.POSTGRES_DB:=llamastack}\n
    \ user: ${env.POSTGRES_USER:=llamastack}\n password: ${env.POSTGRES_PASSWORD:=llamastack}\nmodels:\n-
    metadata:\n embedding_dimension: 384\n model_id: all-MiniLM-L6-v2\n provider_id:
    sentence-transformers\n model_type: embedding\n- metadata: {}\n model_id: ${env.INFERENCE_MODEL}\n
    \ provider_id: vllm-inference\n model_type: llm\n- metadata: {}\n model_id:
    ${env.SAFETY_MODEL:=meta-llama/Llama-Guard-3-1B}\n provider_id: vllm-safety\n
    \ model_type: llm\nshields:\n- shield_id: ${env.SAFETY_MODEL:=meta-llama/Llama-Guard-3-1B}\nvector_dbs:
    []\ndatasets: []\nscoring_fns: []\nbenchmarks: []\ntool_groups:\n- toolgroup_id:
    builtin::websearch\n provider_id: tavily-search\n- toolgroup_id: builtin::rag\n
    \ provider_id: rag-runtime\nserver:\n port: 8321\n auth:\n provider_config:\n
    \ type: github_token\n"
kind: ConfigMap
metadata:
  creationTimestamp: null
@@ -3,6 +3,7 @@ image_name: kubernetes-demo
apis:
- agents
- inference
- files
- safety
- telemetry
- tool_runtime
@@ -38,6 +39,14 @@ providers:
        db: ${env.POSTGRES_DB:=llamastack}
        user: ${env.POSTGRES_USER:=llamastack}
        password: ${env.POSTGRES_PASSWORD:=llamastack}
  files:
  - provider_id: meta-reference-files
    provider_type: inline::localfs
    config:
      storage_dir: ${env.FILES_STORAGE_DIR:=~/.llama/distributions/starter/files}
      metadata_store:
        type: sqlite
        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter}/files_metadata.db
  safety:
  - provider_id: llama-guard
    provider_type: inline::llama-guard
@@ -18,12 +18,13 @@ embedding_model_id = (
).identifier
embedding_dimension = em.metadata["embedding_dimension"]

_ = client.vector_dbs.register(
vector_db = client.vector_dbs.register(
    vector_db_id=vector_db_id,
    embedding_model=embedding_model_id,
    embedding_dimension=embedding_dimension,
    provider_id="faiss",
)
vector_db_id = vector_db.identifier
source = "https://www.paulgraham.com/greatwork.html"
print("rag_tool> Ingesting document:", source)
document = RAGDocument(
@@ -35,7 +36,7 @@ document = RAGDocument(
client.tool_runtime.rag_tool.insert(
    documents=[document],
    vector_db_id=vector_db_id,
    chunk_size_in_tokens=50,
    chunk_size_in_tokens=100,
)
agent = Agent(
    client,
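The hunk above replaces the throwaway `_ = client.vector_dbs.register(...)` with capturing the registration result and reusing its server-assigned `identifier` for the later `rag_tool.insert` call. As a hedged sketch of the resulting flow only: the import path, `base_url`, and the `RAGDocument` field values below are assumptions for illustration; only the calls shown in the hunk are taken from the change.

```python
# Hedged sketch of the demo's registration flow after this change; import path,
# base_url, and RAGDocument fields are assumed, not taken from the diff.
from llama_stack_client import LlamaStackClient, RAGDocument

client = LlamaStackClient(base_url="http://localhost:8321")  # assumed local server

vector_db = client.vector_dbs.register(
    vector_db_id="my_demo_vector_db",
    embedding_model="all-MiniLM-L6-v2",
    embedding_dimension=384,
    provider_id="faiss",
)
# Use the identifier the server actually assigned, not the requested name.
vector_db_id = vector_db.identifier

document = RAGDocument(
    document_id="doc-1",  # assumed field values
    content="https://www.paulgraham.com/greatwork.html",
    mime_type="text/html",
    metadata={},
)
client.tool_runtime.rag_tool.insert(
    documents=[document],
    vector_db_id=vector_db_id,
    chunk_size_in_tokens=100,
)
```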
@@ -7,4 +7,5 @@ Here's a list of known external providers that you can use with Llama Stack:
| KubeFlow Training | Train models with KubeFlow | Post Training | Remote | [llama-stack-provider-kft](https://github.com/opendatahub-io/llama-stack-provider-kft) |
| KubeFlow Pipelines | Train models with KubeFlow Pipelines | Post Training | Inline **and** Remote | [llama-stack-provider-kfp-trainer](https://github.com/opendatahub-io/llama-stack-provider-kfp-trainer) |
| RamaLama | Inference models with RamaLama | Inference | Remote | [ramalama-stack](https://github.com/containers/ramalama-stack) |
| TrustyAI LM-Eval | Evaluate models with TrustyAI LM-Eval | Eval | Remote | [llama-stack-provider-lmeval](https://github.com/trustyai-explainability/llama-stack-provider-lmeval) |
| TrustyAI LM-Eval | Evaluate models with TrustyAI LM-Eval | Eval | Remote | [llama-stack-provider-lmeval](https://github.com/trustyai-explainability/llama-stack-provider-lmeval) |
| MongoDB | VectorIO with MongoDB | Vector_IO | Remote | [mongodb-llama-stack](https://github.com/mongodb-partners/mongodb-llama-stack) |
@@ -15,8 +15,8 @@ AWS Bedrock inference provider for accessing various AI models through AWS's managed service.
| `profile_name` | `str \| None` | No | | The profile name that contains credentials to use. Default: use environment variable AWS_PROFILE |
| `total_max_attempts` | `int \| None` | No | | An integer representing the maximum number of attempts that will be made for a single request, including the initial attempt. Default: use environment variable AWS_MAX_ATTEMPTS |
| `retry_mode` | `str \| None` | No | | A string representing the type of retries Boto3 will perform. Default: use environment variable AWS_RETRY_MODE |
| `connect_timeout` | `float \| None` | No | 60 | The time in seconds until a timeout exception is thrown when attempting to make a connection. The default is 60 seconds. |
| `read_timeout` | `float \| None` | No | 60 | The time in seconds until a timeout exception is thrown when attempting to read from a connection. The default is 60 seconds. |
| `connect_timeout` | `float \| None` | No | 60.0 | The time in seconds until a timeout exception is thrown when attempting to make a connection. The default is 60 seconds. |
| `read_timeout` | `float \| None` | No | 60.0 | The time in seconds until a timeout exception is thrown when attempting to read from a connection. The default is 60 seconds. |
| `session_ttl` | `int \| None` | No | 3600 | The time in seconds until a session expires. The default is 3600 seconds (1 hour). |

## Sample Configuration
@@ -15,8 +15,8 @@ AWS Bedrock safety provider for content moderation using AWS's safety services.
| `profile_name` | `str \| None` | No | | The profile name that contains credentials to use. Default: use environment variable AWS_PROFILE |
| `total_max_attempts` | `int \| None` | No | | An integer representing the maximum number of attempts that will be made for a single request, including the initial attempt. Default: use environment variable AWS_MAX_ATTEMPTS |
| `retry_mode` | `str \| None` | No | | A string representing the type of retries Boto3 will perform. Default: use environment variable AWS_RETRY_MODE |
| `connect_timeout` | `float \| None` | No | 60 | The time in seconds until a timeout exception is thrown when attempting to make a connection. The default is 60 seconds. |
| `read_timeout` | `float \| None` | No | 60 | The time in seconds until a timeout exception is thrown when attempting to read from a connection. The default is 60 seconds. |
| `connect_timeout` | `float \| None` | No | 60.0 | The time in seconds until a timeout exception is thrown when attempting to make a connection. The default is 60 seconds. |
| `read_timeout` | `float \| None` | No | 60.0 | The time in seconds until a timeout exception is thrown when attempting to read from a connection. The default is 60 seconds. |
| `session_ttl` | `int \| None` | No | 3600 | The time in seconds until a session expires. The default is 3600 seconds (1 hour). |

## Sample Configuration