Merge branch 'main' into content-extension

This commit is contained in:
Francisco Arceo 2025-08-25 14:22:15 -06:00 committed by GitHub
commit 3e11e1472c
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
334 changed files with 22841 additions and 8940 deletions

View file

@ -4605,6 +4605,49 @@
}
}
},
"/v1/inference/rerank": {
"post": {
"responses": {
"200": {
"description": "RerankResponse with indices sorted by relevance score (descending).",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/RerankResponse"
}
}
}
},
"400": {
"$ref": "#/components/responses/BadRequest400"
},
"429": {
"$ref": "#/components/responses/TooManyRequests429"
},
"500": {
"$ref": "#/components/responses/InternalServerError500"
},
"default": {
"$ref": "#/components/responses/DefaultError"
}
},
"tags": [
"Inference"
],
"description": "Rerank a list of documents based on their relevance to a query.",
"parameters": [],
"requestBody": {
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/RerankRequest"
}
}
},
"required": true
}
}
},
"/v1/agents/{agent_id}/session/{session_id}/turn/{turn_id}/resume": {
"post": {
"responses": {
@ -8821,6 +8864,61 @@
"title": "OpenAIResponseOutputMessageMCPListTools",
"description": "MCP list tools output message containing available tools from an MCP server."
},
"OpenAIResponseContentPart": {
"oneOf": [
{
"$ref": "#/components/schemas/OpenAIResponseContentPartOutputText"
},
{
"$ref": "#/components/schemas/OpenAIResponseContentPartRefusal"
}
],
"discriminator": {
"propertyName": "type",
"mapping": {
"output_text": "#/components/schemas/OpenAIResponseContentPartOutputText",
"refusal": "#/components/schemas/OpenAIResponseContentPartRefusal"
}
}
},
"OpenAIResponseContentPartOutputText": {
"type": "object",
"properties": {
"type": {
"type": "string",
"const": "output_text",
"default": "output_text"
},
"text": {
"type": "string"
}
},
"additionalProperties": false,
"required": [
"type",
"text"
],
"title": "OpenAIResponseContentPartOutputText"
},
"OpenAIResponseContentPartRefusal": {
"type": "object",
"properties": {
"type": {
"type": "string",
"const": "refusal",
"default": "refusal"
},
"refusal": {
"type": "string"
}
},
"additionalProperties": false,
"required": [
"type",
"refusal"
],
"title": "OpenAIResponseContentPartRefusal"
},
"OpenAIResponseObjectStream": {
"oneOf": [
{
@ -8877,6 +8975,12 @@
{
"$ref": "#/components/schemas/OpenAIResponseObjectStreamResponseMcpCallCompleted"
},
{
"$ref": "#/components/schemas/OpenAIResponseObjectStreamResponseContentPartAdded"
},
{
"$ref": "#/components/schemas/OpenAIResponseObjectStreamResponseContentPartDone"
},
{
"$ref": "#/components/schemas/OpenAIResponseObjectStreamResponseCompleted"
}
@ -8902,6 +9006,8 @@
"response.mcp_call.in_progress": "#/components/schemas/OpenAIResponseObjectStreamResponseMcpCallInProgress",
"response.mcp_call.failed": "#/components/schemas/OpenAIResponseObjectStreamResponseMcpCallFailed",
"response.mcp_call.completed": "#/components/schemas/OpenAIResponseObjectStreamResponseMcpCallCompleted",
"response.content_part.added": "#/components/schemas/OpenAIResponseObjectStreamResponseContentPartAdded",
"response.content_part.done": "#/components/schemas/OpenAIResponseObjectStreamResponseContentPartDone",
"response.completed": "#/components/schemas/OpenAIResponseObjectStreamResponseCompleted"
}
}
@ -8928,6 +9034,80 @@
"title": "OpenAIResponseObjectStreamResponseCompleted",
"description": "Streaming event indicating a response has been completed."
},
"OpenAIResponseObjectStreamResponseContentPartAdded": {
"type": "object",
"properties": {
"response_id": {
"type": "string",
"description": "Unique identifier of the response containing this content"
},
"item_id": {
"type": "string",
"description": "Unique identifier of the output item containing this content part"
},
"part": {
"$ref": "#/components/schemas/OpenAIResponseContentPart",
"description": "The content part that was added"
},
"sequence_number": {
"type": "integer",
"description": "Sequential number for ordering streaming events"
},
"type": {
"type": "string",
"const": "response.content_part.added",
"default": "response.content_part.added",
"description": "Event type identifier, always \"response.content_part.added\""
}
},
"additionalProperties": false,
"required": [
"response_id",
"item_id",
"part",
"sequence_number",
"type"
],
"title": "OpenAIResponseObjectStreamResponseContentPartAdded",
"description": "Streaming event for when a new content part is added to a response item."
},
"OpenAIResponseObjectStreamResponseContentPartDone": {
"type": "object",
"properties": {
"response_id": {
"type": "string",
"description": "Unique identifier of the response containing this content"
},
"item_id": {
"type": "string",
"description": "Unique identifier of the output item containing this content part"
},
"part": {
"$ref": "#/components/schemas/OpenAIResponseContentPart",
"description": "The completed content part"
},
"sequence_number": {
"type": "integer",
"description": "Sequential number for ordering streaming events"
},
"type": {
"type": "string",
"const": "response.content_part.done",
"default": "response.content_part.done",
"description": "Event type identifier, always \"response.content_part.done\""
}
},
"additionalProperties": false,
"required": [
"response_id",
"item_id",
"part",
"sequence_number",
"type"
],
"title": "OpenAIResponseObjectStreamResponseContentPartDone",
"description": "Streaming event for when a content part is completed."
},
"OpenAIResponseObjectStreamResponseCreated": {
"type": "object",
"properties": {
@ -14630,7 +14810,8 @@
"OpenAIFilePurpose": {
"type": "string",
"enum": [
"assistants"
"assistants",
"batch"
],
"title": "OpenAIFilePurpose",
"description": "Valid purpose values for OpenAI Files API."
@ -14707,7 +14888,8 @@
"purpose": {
"type": "string",
"enum": [
"assistants"
"assistants",
"batch"
],
"description": "The intended purpose of the file"
}
@ -15926,12 +16108,16 @@
"value": {
"type": "number",
"description": "The numeric value of the metric at this timestamp"
},
"unit": {
"type": "string"
}
},
"additionalProperties": false,
"required": [
"timestamp",
"value"
"value",
"unit"
],
"title": "MetricDataPoint",
"description": "A single data point in a metric time series."
@ -16489,6 +16675,95 @@
],
"title": "RegisterVectorDbRequest"
},
"RerankRequest": {
"type": "object",
"properties": {
"model": {
"type": "string",
"description": "The identifier of the reranking model to use."
},
"query": {
"oneOf": [
{
"type": "string"
},
{
"$ref": "#/components/schemas/OpenAIChatCompletionContentPartTextParam"
},
{
"$ref": "#/components/schemas/OpenAIChatCompletionContentPartImageParam"
}
],
"description": "The search query to rank items against. Can be a string, text content part, or image content part. The input must not exceed the model's max input token length."
},
"items": {
"type": "array",
"items": {
"oneOf": [
{
"type": "string"
},
{
"$ref": "#/components/schemas/OpenAIChatCompletionContentPartTextParam"
},
{
"$ref": "#/components/schemas/OpenAIChatCompletionContentPartImageParam"
}
]
},
"description": "List of items to rerank. Each item can be a string, text content part, or image content part. Each input must not exceed the model's max input token length."
},
"max_num_results": {
"type": "integer",
"description": "(Optional) Maximum number of results to return. Default: returns all."
}
},
"additionalProperties": false,
"required": [
"model",
"query",
"items"
],
"title": "RerankRequest"
},
"RerankData": {
"type": "object",
"properties": {
"index": {
"type": "integer",
"description": "The original index of the document in the input list"
},
"relevance_score": {
"type": "number",
"description": "The relevance score from the model output. Values are inverted when applicable so that higher scores indicate greater relevance."
}
},
"additionalProperties": false,
"required": [
"index",
"relevance_score"
],
"title": "RerankData",
"description": "A single rerank result from a reranking response."
},
"RerankResponse": {
"type": "object",
"properties": {
"data": {
"type": "array",
"items": {
"$ref": "#/components/schemas/RerankData"
},
"description": "List of rerank result objects, sorted by relevance score (descending)"
}
},
"additionalProperties": false,
"required": [
"data"
],
"title": "RerankResponse",
"description": "Response from a reranking request."
},
"ResumeAgentTurnRequest": {
"type": "object",
"properties": {

View file

@ -3264,6 +3264,37 @@ paths:
schema:
$ref: '#/components/schemas/QueryTracesRequest'
required: true
/v1/inference/rerank:
post:
responses:
'200':
description: >-
RerankResponse with indices sorted by relevance score (descending).
content:
application/json:
schema:
$ref: '#/components/schemas/RerankResponse'
'400':
$ref: '#/components/responses/BadRequest400'
'429':
$ref: >-
#/components/responses/TooManyRequests429
'500':
$ref: >-
#/components/responses/InternalServerError500
default:
$ref: '#/components/responses/DefaultError'
tags:
- Inference
description: >-
Rerank a list of documents based on their relevance to a query.
parameters: []
requestBody:
content:
application/json:
schema:
$ref: '#/components/schemas/RerankRequest'
required: true
/v1/agents/{agent_id}/session/{session_id}/turn/{turn_id}/resume:
post:
responses:
@ -6441,6 +6472,43 @@ components:
title: OpenAIResponseOutputMessageMCPListTools
description: >-
MCP list tools output message containing available tools from an MCP server.
OpenAIResponseContentPart:
oneOf:
- $ref: '#/components/schemas/OpenAIResponseContentPartOutputText'
- $ref: '#/components/schemas/OpenAIResponseContentPartRefusal'
discriminator:
propertyName: type
mapping:
output_text: '#/components/schemas/OpenAIResponseContentPartOutputText'
refusal: '#/components/schemas/OpenAIResponseContentPartRefusal'
OpenAIResponseContentPartOutputText:
type: object
properties:
type:
type: string
const: output_text
default: output_text
text:
type: string
additionalProperties: false
required:
- type
- text
title: OpenAIResponseContentPartOutputText
OpenAIResponseContentPartRefusal:
type: object
properties:
type:
type: string
const: refusal
default: refusal
refusal:
type: string
additionalProperties: false
required:
- type
- refusal
title: OpenAIResponseContentPartRefusal
OpenAIResponseObjectStream:
oneOf:
- $ref: '#/components/schemas/OpenAIResponseObjectStreamResponseCreated'
@ -6461,6 +6529,8 @@ components:
- $ref: '#/components/schemas/OpenAIResponseObjectStreamResponseMcpCallInProgress'
- $ref: '#/components/schemas/OpenAIResponseObjectStreamResponseMcpCallFailed'
- $ref: '#/components/schemas/OpenAIResponseObjectStreamResponseMcpCallCompleted'
- $ref: '#/components/schemas/OpenAIResponseObjectStreamResponseContentPartAdded'
- $ref: '#/components/schemas/OpenAIResponseObjectStreamResponseContentPartDone'
- $ref: '#/components/schemas/OpenAIResponseObjectStreamResponseCompleted'
discriminator:
propertyName: type
@ -6483,6 +6553,8 @@ components:
response.mcp_call.in_progress: '#/components/schemas/OpenAIResponseObjectStreamResponseMcpCallInProgress'
response.mcp_call.failed: '#/components/schemas/OpenAIResponseObjectStreamResponseMcpCallFailed'
response.mcp_call.completed: '#/components/schemas/OpenAIResponseObjectStreamResponseMcpCallCompleted'
response.content_part.added: '#/components/schemas/OpenAIResponseObjectStreamResponseContentPartAdded'
response.content_part.done: '#/components/schemas/OpenAIResponseObjectStreamResponseContentPartDone'
response.completed: '#/components/schemas/OpenAIResponseObjectStreamResponseCompleted'
"OpenAIResponseObjectStreamResponseCompleted":
type: object
@ -6504,6 +6576,76 @@ components:
OpenAIResponseObjectStreamResponseCompleted
description: >-
Streaming event indicating a response has been completed.
"OpenAIResponseObjectStreamResponseContentPartAdded":
type: object
properties:
response_id:
type: string
description: >-
Unique identifier of the response containing this content
item_id:
type: string
description: >-
Unique identifier of the output item containing this content part
part:
$ref: '#/components/schemas/OpenAIResponseContentPart'
description: The content part that was added
sequence_number:
type: integer
description: >-
Sequential number for ordering streaming events
type:
type: string
const: response.content_part.added
default: response.content_part.added
description: >-
Event type identifier, always "response.content_part.added"
additionalProperties: false
required:
- response_id
- item_id
- part
- sequence_number
- type
title: >-
OpenAIResponseObjectStreamResponseContentPartAdded
description: >-
Streaming event for when a new content part is added to a response item.
"OpenAIResponseObjectStreamResponseContentPartDone":
type: object
properties:
response_id:
type: string
description: >-
Unique identifier of the response containing this content
item_id:
type: string
description: >-
Unique identifier of the output item containing this content part
part:
$ref: '#/components/schemas/OpenAIResponseContentPart'
description: The completed content part
sequence_number:
type: integer
description: >-
Sequential number for ordering streaming events
type:
type: string
const: response.content_part.done
default: response.content_part.done
description: >-
Event type identifier, always "response.content_part.done"
additionalProperties: false
required:
- response_id
- item_id
- part
- sequence_number
- type
title: >-
OpenAIResponseObjectStreamResponseContentPartDone
description: >-
Streaming event for when a content part is completed.
"OpenAIResponseObjectStreamResponseCreated":
type: object
properties:
@ -10840,6 +10982,7 @@ components:
type: string
enum:
- assistants
- batch
title: OpenAIFilePurpose
description: >-
Valid purpose values for OpenAI Files API.
@ -10908,6 +11051,7 @@ components:
type: string
enum:
- assistants
- batch
description: The intended purpose of the file
additionalProperties: false
required:
@ -11838,10 +11982,13 @@ components:
type: number
description: >-
The numeric value of the metric at this timestamp
unit:
type: string
additionalProperties: false
required:
- timestamp
- value
- unit
title: MetricDataPoint
description: >-
A single data point in a metric time series.
@ -12252,6 +12399,76 @@ components:
- vector_db_id
- embedding_model
title: RegisterVectorDbRequest
RerankRequest:
type: object
properties:
model:
type: string
description: >-
The identifier of the reranking model to use.
query:
oneOf:
- type: string
- $ref: '#/components/schemas/OpenAIChatCompletionContentPartTextParam'
- $ref: '#/components/schemas/OpenAIChatCompletionContentPartImageParam'
description: >-
The search query to rank items against. Can be a string, text content
part, or image content part. The input must not exceed the model's max
input token length.
items:
type: array
items:
oneOf:
- type: string
- $ref: '#/components/schemas/OpenAIChatCompletionContentPartTextParam'
- $ref: '#/components/schemas/OpenAIChatCompletionContentPartImageParam'
description: >-
List of items to rerank. Each item can be a string, text content part,
or image content part. Each input must not exceed the model's max input
token length.
max_num_results:
type: integer
description: >-
(Optional) Maximum number of results to return. Default: returns all.
additionalProperties: false
required:
- model
- query
- items
title: RerankRequest
RerankData:
type: object
properties:
index:
type: integer
description: >-
The original index of the document in the input list
relevance_score:
type: number
description: >-
The relevance score from the model output. Values are inverted when applicable
so that higher scores indicate greater relevance.
additionalProperties: false
required:
- index
- relevance_score
title: RerankData
description: >-
A single rerank result from a reranking response.
RerankResponse:
type: object
properties:
data:
type: array
items:
$ref: '#/components/schemas/RerankData'
description: >-
List of rerank result objects, sorted by relevance score (descending)
additionalProperties: false
required:
- data
title: RerankResponse
description: Response from a reranking request.
ResumeAgentTurnRequest:
type: object
properties:

View file

@ -18,3 +18,4 @@ We are working on adding a few more APIs to complete the application lifecycle.
- **Batch Inference**: run inference on a dataset of inputs
- **Batch Agents**: run agents on a dataset of inputs
- **Synthetic Data Generation**: generate synthetic data for model development
- **Batches**: OpenAI-compatible batch management for inference

View file

@ -4,11 +4,11 @@
## Adding a New Provider
See the [Adding a New API Provider Page](new_api_provider.md) which describes how to add new API providers to the Stack.
See:
- [Adding a New API Provider Page](new_api_provider.md) which describes how to add new API providers to the Stack.
- [Vector Database Page](new_vector_database.md) which describes how to add a new vector databases with Llama Stack.
- [External Provider Page](../providers/external/index.md) which describes how to add external providers to the Stack.
See the [Vector Database Page](new_vector_database.md) which describes how to add a new vector databases with Llama Stack.
See the [External Provider Page](../providers/external/index.md) which describes how to add external providers to the Stack.
```{toctree}
:maxdepth: 1
:hidden:
@ -19,11 +19,21 @@ new_vector_database
## Testing
See the [Test Page](testing.md) which describes how to test your changes.
```{include} ../../../tests/README.md
```
## Advanced Topics
For developers who need deeper understanding of the testing system internals:
```{toctree}
:maxdepth: 1
:hidden:
:caption: Testing
testing
```
testing/record-replay
```
### Benchmarking
```{include} ../../../docs/source/distributions/k8s-benchmark/README.md
```

View file

@ -1,8 +0,0 @@
```{include} ../../../tests/README.md
```
```{include} ../../../tests/unit/README.md
```
```{include} ../../../tests/integration/README.md
```

View file

@ -0,0 +1,234 @@
# Record-Replay System
Understanding how Llama Stack captures and replays API interactions for testing.
## Overview
The record-replay system solves a fundamental challenge in AI testing: how do you test against expensive, non-deterministic APIs without breaking the bank or dealing with flaky tests?
The solution: intercept API calls, store real responses, and replay them later. This gives you real API behavior without the cost or variability.
## How It Works
### Request Hashing
Every API request gets converted to a deterministic hash for lookup:
```python
def normalize_request(method: str, url: str, headers: dict, body: dict) -> str:
normalized = {
"method": method.upper(),
"endpoint": urlparse(url).path, # Just the path, not full URL
"body": body, # Request parameters
}
return hashlib.sha256(json.dumps(normalized, sort_keys=True).encode()).hexdigest()
```
**Key insight:** The hashing is intentionally precise. Different whitespace, float precision, or parameter order produces different hashes. This prevents subtle bugs from false cache hits.
```python
# These produce DIFFERENT hashes:
{"content": "Hello world"}
{"content": "Hello world\n"}
{"temperature": 0.7}
{"temperature": 0.7000001}
```
### Client Interception
The system patches OpenAI and Ollama client methods to intercept calls before they leave your application. This happens transparently - your test code doesn't change.
### Storage Architecture
Recordings use a two-tier storage system optimized for both speed and debuggability:
```
recordings/
├── index.sqlite # Fast lookup by request hash
└── responses/
├── abc123def456.json # Individual response files
└── def789ghi012.json
```
**SQLite index** enables O(log n) hash lookups and metadata queries without loading response bodies.
**JSON files** store complete request/response pairs in human-readable format for debugging.
## Recording Modes
### LIVE Mode
Direct API calls with no recording or replay:
```python
with inference_recording(mode=InferenceMode.LIVE):
response = await client.chat.completions.create(...)
```
Use for initial development and debugging against real APIs.
### RECORD Mode
Captures API interactions while passing through real responses:
```python
with inference_recording(mode=InferenceMode.RECORD, storage_dir="./recordings"):
response = await client.chat.completions.create(...)
# Real API call made, response captured AND returned
```
The recording process:
1. Request intercepted and hashed
2. Real API call executed
3. Response captured and serialized
4. Recording stored to disk
5. Original response returned to caller
### REPLAY Mode
Returns stored responses instead of making API calls:
```python
with inference_recording(mode=InferenceMode.REPLAY, storage_dir="./recordings"):
response = await client.chat.completions.create(...)
# No API call made, cached response returned instantly
```
The replay process:
1. Request intercepted and hashed
2. Hash looked up in SQLite index
3. Response loaded from JSON file
4. Response deserialized and returned
5. Error if no recording found
## Streaming Support
Streaming APIs present a unique challenge: how do you capture an async generator?
### The Problem
```python
# How do you record this?
async for chunk in client.chat.completions.create(stream=True):
process(chunk)
```
### The Solution
The system captures all chunks immediately before yielding any:
```python
async def handle_streaming_record(response):
# Capture complete stream first
chunks = []
async for chunk in response:
chunks.append(chunk)
# Store complete recording
storage.store_recording(
request_hash, request_data, {"body": chunks, "is_streaming": True}
)
# Return generator that replays captured chunks
async def replay_stream():
for chunk in chunks:
yield chunk
return replay_stream()
```
This ensures:
- **Complete capture** - The entire stream is saved atomically
- **Interface preservation** - The returned object behaves like the original API
- **Deterministic replay** - Same chunks in the same order every time
## Serialization
API responses contain complex Pydantic objects that need careful serialization:
```python
def _serialize_response(response):
if hasattr(response, "model_dump"):
# Preserve type information for proper deserialization
return {
"__type__": f"{response.__class__.__module__}.{response.__class__.__qualname__}",
"__data__": response.model_dump(mode="json"),
}
return response
```
This preserves type safety - when replayed, you get the same Pydantic objects with all their validation and methods.
## Environment Integration
### Environment Variables
Control recording behavior globally:
```bash
export LLAMA_STACK_TEST_INFERENCE_MODE=replay
export LLAMA_STACK_TEST_RECORDING_DIR=/path/to/recordings
pytest tests/integration/
```
### Pytest Integration
The system integrates automatically based on environment variables, requiring no changes to test code.
## Debugging Recordings
### Inspecting Storage
```bash
# See what's recorded
sqlite3 recordings/index.sqlite "SELECT endpoint, model, timestamp FROM recordings LIMIT 10;"
# View specific response
cat recordings/responses/abc123def456.json | jq '.response.body'
# Find recordings by endpoint
sqlite3 recordings/index.sqlite "SELECT * FROM recordings WHERE endpoint='/v1/chat/completions';"
```
### Common Issues
**Hash mismatches:** Request parameters changed slightly between record and replay
```bash
# Compare request details
cat recordings/responses/abc123.json | jq '.request'
```
**Serialization errors:** Response types changed between versions
```bash
# Re-record with updated types
rm recordings/responses/failing_hash.json
LLAMA_STACK_TEST_INFERENCE_MODE=record pytest test_failing.py
```
**Missing recordings:** New test or changed parameters
```bash
# Record the missing interaction
LLAMA_STACK_TEST_INFERENCE_MODE=record pytest test_new.py
```
## Design Decisions
### Why Not Mocks?
Traditional mocking breaks down with AI APIs because:
- Response structures are complex and evolve frequently
- Streaming behavior is hard to mock correctly
- Edge cases in real APIs get missed
- Mocks become brittle maintenance burdens
### Why Precise Hashing?
Loose hashing (normalizing whitespace, rounding floats) seems convenient but hides bugs. If a test changes slightly, you want to know about it rather than accidentally getting the wrong cached response.
### Why JSON + SQLite?
- **JSON** - Human readable, diff-friendly, easy to inspect and modify
- **SQLite** - Fast indexed lookups without loading response bodies
- **Hybrid** - Best of both worlds for different use cases
This system provides reliable, fast testing against real AI APIs while maintaining the ability to debug issues when they arise.

View file

@ -225,8 +225,32 @@ server:
port: 8321 # Port to listen on (default: 8321)
tls_certfile: "/path/to/cert.pem" # Optional: Path to TLS certificate for HTTPS
tls_keyfile: "/path/to/key.pem" # Optional: Path to TLS key for HTTPS
cors: true # Optional: Enable CORS (dev mode) or full config object
```
### CORS Configuration
CORS (Cross-Origin Resource Sharing) can be configured in two ways:
**Local development** (allows localhost origins only):
```yaml
server:
cors: true
```
**Explicit configuration** (custom origins and settings):
```yaml
server:
cors:
allow_origins: ["https://myapp.com", "https://app.example.com"]
allow_methods: ["GET", "POST", "PUT", "DELETE"]
allow_headers: ["Content-Type", "Authorization"]
allow_credentials: true
max_age: 3600
```
When `cors: true`, the server enables secure localhost-only access for local development. For production, specify exact origins to maintain security.
### Authentication Configuration
> **Breaking Change (v0.2.14)**: The authentication configuration structure has changed. The previous format with `provider_type` and `config` fields has been replaced with a unified `provider_config` field that includes the `type` field. Update your configuration files accordingly.
@ -618,6 +642,54 @@ Content-Type: application/json
}
```
### CORS Configuration
Configure CORS to allow web browsers to make requests from different domains. Disabled by default.
#### Quick Setup
For development, use the simple boolean flag:
```yaml
server:
cors: true # Auto-enables localhost with any port
```
This automatically allows `http://localhost:*` and `https://localhost:*` with secure defaults.
#### Custom Configuration
For specific origins and full control:
```yaml
server:
cors:
allow_origins: ["https://myapp.com", "https://staging.myapp.com"]
allow_credentials: true
allow_methods: ["GET", "POST", "PUT", "DELETE"]
allow_headers: ["Content-Type", "Authorization"]
allow_origin_regex: "https://.*\\.example\\.com" # Optional regex pattern
expose_headers: ["X-Total-Count"]
max_age: 86400
```
#### Configuration Options
| Field | Description | Default |
| -------------------- | ---------------------------------------------- | ------- |
| `allow_origins` | List of allowed origins. Use `["*"]` for any. | `["*"]` |
| `allow_origin_regex` | Regex pattern for allowed origins (optional). | `None` |
| `allow_methods` | Allowed HTTP methods. | `["*"]` |
| `allow_headers` | Allowed headers. | `["*"]` |
| `allow_credentials` | Allow credentials (cookies, auth headers). | `false` |
| `expose_headers` | Headers exposed to browser. | `[]` |
| `max_age` | Preflight cache time (seconds). | `600` |
**Security Notes**:
- `allow_credentials: true` requires explicit origins (no wildcards)
- `cors: true` enables localhost access only (secure for development)
- For public APIs, always specify exact allowed origins
## Extending to handle Safety
Configuring Safety can be a little involved so it is instructive to go through an example.

View file

@ -17,7 +17,6 @@ client = LlamaStackAsLibraryClient(
# provider_data is optional, but if you need to pass in any provider specific data, you can do so here.
provider_data={"tavily_search_api_key": os.environ["TAVILY_SEARCH_API_KEY"]},
)
client.initialize()
```
This will parse your config and set up any inline implementations and remote clients needed for your implementation.
@ -32,5 +31,4 @@ If you've created a [custom distribution](https://llama-stack.readthedocs.io/en/
```python
client = LlamaStackAsLibraryClient(config_path)
client.initialize()
```

View file

@ -0,0 +1,156 @@
# Llama Stack Benchmark Suite on Kubernetes
## Motivation
Performance benchmarking is critical for understanding the overhead and characteristics of the Llama Stack abstraction layer compared to direct inference engines like vLLM.
### Why This Benchmark Suite Exists
**Performance Validation**: The Llama Stack provides a unified API layer across multiple inference providers, but this abstraction introduces potential overhead. This benchmark suite quantifies the performance impact by comparing:
- Llama Stack inference (with vLLM backend)
- Direct vLLM inference calls
- Both under identical Kubernetes deployment conditions
**Production Readiness Assessment**: Real-world deployments require understanding performance characteristics under load. This suite simulates concurrent user scenarios with configurable parameters (duration, concurrency, request patterns) to validate production readiness.
**Regression Detection (TODO)**: As the Llama Stack evolves, this benchmark provides automated regression detection for performance changes. CI/CD pipelines can leverage these benchmarks to catch performance degradations before production deployments.
**Resource Planning**: By measuring throughput, latency percentiles, and resource utilization patterns, teams can make informed decisions about:
- Kubernetes resource allocation (CPU, memory, GPU)
- Auto-scaling configurations
- Cost optimization strategies
### Key Metrics Captured
The benchmark suite measures critical performance indicators:
- **Throughput**: Requests per second under sustained load
- **Latency Distribution**: P50, P95, P99 response times
- **Time to First Token (TTFT)**: Critical for streaming applications
- **Error Rates**: Request failures and timeout analysis
This data enables data-driven architectural decisions and performance optimization efforts.
## Setup
**1. Deploy base k8s infrastructure:**
```bash
cd ../k8s
./apply.sh
```
**2. Deploy benchmark components:**
```bash
cd ../k8s-benchmark
./apply.sh
```
**3. Verify deployment:**
```bash
kubectl get pods
# Should see: llama-stack-benchmark-server, vllm-server, etc.
```
## Quick Start
### Basic Benchmarks
**Benchmark Llama Stack (default):**
```bash
cd docs/source/distributions/k8s-benchmark/
./run-benchmark.sh
```
**Benchmark vLLM direct:**
```bash
./run-benchmark.sh --target vllm
```
### Custom Configuration
**Extended benchmark with high concurrency:**
```bash
./run-benchmark.sh --target vllm --duration 120 --concurrent 20
```
**Short test run:**
```bash
./run-benchmark.sh --target stack --duration 30 --concurrent 5
```
## Command Reference
### run-benchmark.sh Options
```bash
./run-benchmark.sh [options]
Options:
-t, --target <stack|vllm> Target to benchmark (default: stack)
-d, --duration <seconds> Duration in seconds (default: 60)
-c, --concurrent <users> Number of concurrent users (default: 10)
-h, --help Show help message
Examples:
./run-benchmark.sh --target vllm # Benchmark vLLM direct
./run-benchmark.sh --target stack # Benchmark Llama Stack
./run-benchmark.sh -t vllm -d 120 -c 20 # vLLM with 120s, 20 users
```
## Local Testing
### Running Benchmark Locally
For local development without Kubernetes:
**1. Start OpenAI mock server:**
```bash
uv run python openai-mock-server.py --port 8080
```
**2. Run benchmark against mock server:**
```bash
uv run python benchmark.py \
--base-url http://localhost:8080/v1 \
--model mock-inference \
--duration 30 \
--concurrent 5
```
**3. Test against local vLLM server:**
```bash
# If you have vLLM running locally on port 8000
uv run python benchmark.py \
--base-url http://localhost:8000/v1 \
--model meta-llama/Llama-3.2-3B-Instruct \
--duration 30 \
--concurrent 5
```
**4. Profile the running server:**
```bash
./profile_running_server.sh
```
### OpenAI Mock Server
The `openai-mock-server.py` provides:
- **OpenAI-compatible API** for testing without real models
- **Configurable streaming delay** via `STREAM_DELAY_SECONDS` env var
- **Consistent responses** for reproducible benchmarks
- **Lightweight testing** without GPU requirements
**Mock server usage:**
```bash
uv run python openai-mock-server.py --port 8080
```
The mock server is also deployed in k8s as `openai-mock-service:8080` and can be used by changing the Llama Stack configuration to use the `mock-vllm-inference` provider.
## Files in this Directory
- `benchmark.py` - Core benchmark script with async streaming support
- `run-benchmark.sh` - Main script with target selection and configuration
- `openai-mock-server.py` - Mock OpenAI API server for local testing
- `README.md` - This documentation file

View file

@ -8,7 +8,6 @@
# Deploys the benchmark-specific components on top of the base k8s deployment (../k8s/apply.sh).
export MOCK_INFERENCE_PORT=8080
export STREAM_DELAY_SECONDS=0.005
export POSTGRES_USER=llamastack
@ -20,14 +19,7 @@ export SAFETY_MODEL=meta-llama/Llama-Guard-3-1B
export MOCK_INFERENCE_MODEL=mock-inference
# Use llama-stack-benchmark-service as the benchmark server
export LOCUST_HOST=http://llama-stack-benchmark-service:8323
export LOCUST_BASE_PATH=/v1/openai/v1
# Use vllm-service as the benchmark server
# export LOCUST_HOST=http://vllm-server:8000
# export LOCUST_BASE_PATH=/v1
export MOCK_INFERENCE_URL=openai-mock-service:8080
export BENCHMARK_INFERENCE_MODEL=$INFERENCE_MODEL
@ -35,13 +27,6 @@ set -euo pipefail
set -x
# Deploy benchmark-specific components
# Deploy OpenAI mock server
kubectl create configmap openai-mock --from-file=openai-mock-server.py \
--dry-run=client -o yaml | kubectl apply --validate=false -f -
envsubst < openai-mock-deployment.yaml | kubectl apply --validate=false -f -
# Create configmap with our custom stack config
kubectl create configmap llama-stack-config --from-file=stack_run_config.yaml \
--dry-run=client -o yaml > stack-configmap.yaml
@ -49,9 +34,3 @@ kubectl apply --validate=false -f stack-configmap.yaml
# Deploy our custom llama stack server (overriding the base one)
envsubst < stack-k8s.yaml.template | kubectl apply --validate=false -f -
# Deploy Locust load testing
kubectl create configmap locust-script --from-file=locustfile.py \
--dry-run=client -o yaml | kubectl apply --validate=false -f -
envsubst < locust-k8s.yaml | kubectl apply --validate=false -f -

View file

@ -0,0 +1,267 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
"""
Simple benchmark script for Llama Stack with OpenAI API compatibility.
"""
import argparse
import asyncio
import os
import random
import statistics
import time
from typing import Tuple
import aiohttp
class BenchmarkStats:
def __init__(self):
self.response_times = []
self.ttft_times = []
self.chunks_received = []
self.errors = []
self.success_count = 0
self.total_requests = 0
self.concurrent_users = 0
self.start_time = None
self.end_time = None
self._lock = asyncio.Lock()
async def add_result(self, response_time: float, chunks: int, ttft: float = None, error: str = None):
async with self._lock:
self.total_requests += 1
if error:
self.errors.append(error)
else:
self.success_count += 1
self.response_times.append(response_time)
self.chunks_received.append(chunks)
if ttft is not None:
self.ttft_times.append(ttft)
def print_summary(self):
if not self.response_times:
print("No successful requests to report")
if self.errors:
print(f"Total errors: {len(self.errors)}")
print("First 5 errors:")
for error in self.errors[:5]:
print(f" {error}")
return
total_time = self.end_time - self.start_time
success_rate = (self.success_count / self.total_requests) * 100
print(f"\n{'='*60}")
print(f"BENCHMARK RESULTS")
print(f"{'='*60}")
print(f"Total time: {total_time:.2f}s")
print(f"Concurrent users: {self.concurrent_users}")
print(f"Total requests: {self.total_requests}")
print(f"Successful requests: {self.success_count}")
print(f"Failed requests: {len(self.errors)}")
print(f"Success rate: {success_rate:.1f}%")
print(f"Requests per second: {self.success_count / total_time:.2f}")
print(f"\nResponse Time Statistics:")
print(f" Mean: {statistics.mean(self.response_times):.3f}s")
print(f" Median: {statistics.median(self.response_times):.3f}s")
print(f" Min: {min(self.response_times):.3f}s")
print(f" Max: {max(self.response_times):.3f}s")
if len(self.response_times) > 1:
print(f" Std Dev: {statistics.stdev(self.response_times):.3f}s")
percentiles = [50, 90, 95, 99]
sorted_times = sorted(self.response_times)
print(f"\nPercentiles:")
for p in percentiles:
idx = int(len(sorted_times) * p / 100) - 1
idx = max(0, min(idx, len(sorted_times) - 1))
print(f" P{p}: {sorted_times[idx]:.3f}s")
if self.ttft_times:
print(f"\nTime to First Token (TTFT) Statistics:")
print(f" Mean: {statistics.mean(self.ttft_times):.3f}s")
print(f" Median: {statistics.median(self.ttft_times):.3f}s")
print(f" Min: {min(self.ttft_times):.3f}s")
print(f" Max: {max(self.ttft_times):.3f}s")
if len(self.ttft_times) > 1:
print(f" Std Dev: {statistics.stdev(self.ttft_times):.3f}s")
sorted_ttft = sorted(self.ttft_times)
print(f"\nTTFT Percentiles:")
for p in percentiles:
idx = int(len(sorted_ttft) * p / 100) - 1
idx = max(0, min(idx, len(sorted_ttft) - 1))
print(f" P{p}: {sorted_ttft[idx]:.3f}s")
if self.chunks_received:
print(f"\nStreaming Statistics:")
print(f" Mean chunks per response: {statistics.mean(self.chunks_received):.1f}")
print(f" Total chunks received: {sum(self.chunks_received)}")
if self.errors:
print(f"\nErrors (showing first 5):")
for error in self.errors[:5]:
print(f" {error}")
class LlamaStackBenchmark:
def __init__(self, base_url: str, model_id: str):
self.base_url = base_url.rstrip('/')
self.model_id = model_id
self.headers = {"Content-Type": "application/json"}
self.test_messages = [
[{"role": "user", "content": "Hi"}],
[{"role": "user", "content": "What is the capital of France?"}],
[{"role": "user", "content": "Explain quantum physics in simple terms."}],
[{"role": "user", "content": "Write a short story about a robot learning to paint."}],
[
{"role": "user", "content": "What is machine learning?"},
{"role": "assistant", "content": "Machine learning is a subset of AI..."},
{"role": "user", "content": "Can you give me a practical example?"}
]
]
async def make_async_streaming_request(self) -> Tuple[float, int, float | None, str | None]:
"""Make a single async streaming chat completion request."""
messages = random.choice(self.test_messages)
payload = {
"model": self.model_id,
"messages": messages,
"stream": True,
"max_tokens": 100
}
start_time = time.time()
chunks_received = 0
ttft = None
error = None
session = aiohttp.ClientSession()
try:
async with session.post(
f"{self.base_url}/chat/completions",
headers=self.headers,
json=payload,
timeout=aiohttp.ClientTimeout(total=30)
) as response:
if response.status == 200:
async for line in response.content:
if line:
line_str = line.decode('utf-8').strip()
if line_str.startswith('data: '):
chunks_received += 1
if ttft is None:
ttft = time.time() - start_time
if line_str == 'data: [DONE]':
break
if chunks_received == 0:
error = "No streaming chunks received"
else:
text = await response.text()
error = f"HTTP {response.status}: {text[:100]}"
except Exception as e:
error = f"Request error: {str(e)}"
finally:
await session.close()
response_time = time.time() - start_time
return response_time, chunks_received, ttft, error
async def run_benchmark(self, duration: int, concurrent_users: int) -> BenchmarkStats:
"""Run benchmark using async requests for specified duration."""
stats = BenchmarkStats()
stats.concurrent_users = concurrent_users
stats.start_time = time.time()
print(f"Starting benchmark: {duration}s duration, {concurrent_users} concurrent users")
print(f"Target URL: {self.base_url}/chat/completions")
print(f"Model: {self.model_id}")
connector = aiohttp.TCPConnector(limit=concurrent_users)
async with aiohttp.ClientSession(connector=connector) as session:
async def worker(worker_id: int):
"""Worker that sends requests sequentially until canceled."""
request_count = 0
while True:
try:
response_time, chunks, ttft, error = await self.make_async_streaming_request()
await stats.add_result(response_time, chunks, ttft, error)
request_count += 1
except asyncio.CancelledError:
break
except Exception as e:
await stats.add_result(0, 0, None, f"Worker {worker_id} error: {str(e)}")
# Progress reporting task
async def progress_reporter():
last_report_time = time.time()
while True:
try:
await asyncio.sleep(1) # Report every second
if time.time() >= last_report_time + 10: # Report every 10 seconds
elapsed = time.time() - stats.start_time
print(f"Completed: {stats.total_requests} requests in {elapsed:.1f}s")
last_report_time = time.time()
except asyncio.CancelledError:
break
# Spawn concurrent workers
tasks = [asyncio.create_task(worker(i)) for i in range(concurrent_users)]
progress_task = asyncio.create_task(progress_reporter())
tasks.append(progress_task)
# Wait for duration then cancel all tasks
await asyncio.sleep(duration)
for task in tasks:
task.cancel()
# Wait for all tasks to complete
await asyncio.gather(*tasks, return_exceptions=True)
stats.end_time = time.time()
return stats
def main():
parser = argparse.ArgumentParser(description="Llama Stack Benchmark Tool")
parser.add_argument("--base-url", default=os.getenv("BENCHMARK_BASE_URL", "http://localhost:8000/v1/openai/v1"),
help="Base URL for the API (default: http://localhost:8000/v1/openai/v1)")
parser.add_argument("--model", default=os.getenv("INFERENCE_MODEL", "test-model"),
help="Model ID to use for requests")
parser.add_argument("--duration", type=int, default=60,
help="Duration in seconds to run benchmark (default: 60)")
parser.add_argument("--concurrent", type=int, default=10,
help="Number of concurrent users (default: 10)")
args = parser.parse_args()
benchmark = LlamaStackBenchmark(args.base_url, args.model)
try:
stats = asyncio.run(benchmark.run_benchmark(args.duration, args.concurrent))
stats.print_summary()
except KeyboardInterrupt:
print("\nBenchmark interrupted by user")
except Exception as e:
print(f"Benchmark failed: {e}")
if __name__ == "__main__":
main()

View file

@ -1,131 +0,0 @@
apiVersion: apps/v1
kind: Deployment
metadata:
name: locust-master
labels:
app: locust
role: master
spec:
replicas: 1
selector:
matchLabels:
app: locust
role: master
template:
metadata:
labels:
app: locust
role: master
spec:
containers:
- name: locust-master
image: locustio/locust:2.31.8
ports:
- containerPort: 8089 # Web UI
- containerPort: 5557 # Master communication
env:
- name: LOCUST_HOST
value: "${LOCUST_HOST}"
- name: LOCUST_LOCUSTFILE
value: "/locust/locustfile.py"
- name: LOCUST_WEB_HOST
value: "0.0.0.0"
- name: LOCUST_MASTER
value: "true"
- name: LOCUST_BASE_PATH
value: "${LOCUST_BASE_PATH}"
- name: INFERENCE_MODEL
value: "${BENCHMARK_INFERENCE_MODEL}"
volumeMounts:
- name: locust-script
mountPath: /locust
command: ["locust"]
args:
- "--master"
- "--web-host=0.0.0.0"
- "--web-port=8089"
- "--host=${LOCUST_HOST}"
- "--locustfile=/locust/locustfile.py"
volumes:
- name: locust-script
configMap:
name: locust-script
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: locust-worker
labels:
app: locust
role: worker
spec:
replicas: 2 # Start with 2 workers, can be scaled up
selector:
matchLabels:
app: locust
role: worker
template:
metadata:
labels:
app: locust
role: worker
spec:
containers:
- name: locust-worker
image: locustio/locust:2.31.8
env:
- name: LOCUST_HOST
value: "${LOCUST_HOST}"
- name: LOCUST_LOCUSTFILE
value: "/locust/locustfile.py"
- name: LOCUST_MASTER_HOST
value: "locust-master-service"
- name: LOCUST_MASTER_PORT
value: "5557"
- name: INFERENCE_MODEL
value: "${BENCHMARK_INFERENCE_MODEL}"
- name: LOCUST_BASE_PATH
value: "${LOCUST_BASE_PATH}"
volumeMounts:
- name: locust-script
mountPath: /locust
command: ["locust"]
args:
- "--worker"
- "--master-host=locust-master-service"
- "--master-port=5557"
- "--locustfile=/locust/locustfile.py"
volumes:
- name: locust-script
configMap:
name: locust-script
---
apiVersion: v1
kind: Service
metadata:
name: locust-master-service
spec:
selector:
app: locust
role: master
ports:
- name: web-ui
port: 8089
targetPort: 8089
- name: master-comm
port: 5557
targetPort: 5557
type: ClusterIP
---
apiVersion: v1
kind: Service
metadata:
name: locust-web-ui
spec:
selector:
app: locust
role: master
ports:
- port: 8089
targetPort: 8089
type: ClusterIP # Keep internal, use port-forward to access

View file

@ -1,78 +0,0 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
"""
Locust load testing script for Llama Stack with Prism mock OpenAI provider.
"""
import random
from locust import HttpUser, task, between
import os
base_path = os.getenv("LOCUST_BASE_PATH", "/v1/openai/v1")
MODEL_ID = os.getenv("INFERENCE_MODEL")
class LlamaStackUser(HttpUser):
wait_time = between(0.0, 0.0001)
def on_start(self):
"""Setup authentication and test data."""
# No auth required for benchmark server
self.headers = {
"Content-Type": "application/json"
}
# Test messages of varying lengths
self.test_messages = [
[{"role": "user", "content": "Hi"}],
[{"role": "user", "content": "What is the capital of France?"}],
[{"role": "user", "content": "Explain quantum physics in simple terms."}],
[{"role": "user", "content": "Write a short story about a robot learning to paint."}],
[
{"role": "user", "content": "What is machine learning?"},
{"role": "assistant", "content": "Machine learning is a subset of AI..."},
{"role": "user", "content": "Can you give me a practical example?"}
]
]
@task(weight=100)
def chat_completion_streaming(self):
"""Test streaming chat completion (20% of requests)."""
messages = random.choice(self.test_messages)
payload = {
"model": MODEL_ID,
"messages": messages,
"stream": True,
"max_tokens": 100
}
with self.client.post(
f"{base_path}/chat/completions",
headers=self.headers,
json=payload,
stream=True,
catch_response=True
) as response:
if response.status_code == 200:
chunks_received = 0
try:
for line in response.iter_lines():
if line:
line_str = line.decode('utf-8')
if line_str.startswith('data: '):
chunks_received += 1
if line_str.strip() == 'data: [DONE]':
break
if chunks_received > 0:
response.success()
else:
response.failure("No streaming chunks received")
except Exception as e:
response.failure(f"Streaming error: {e}")
else:
response.failure(f"HTTP {response.status_code}: {response.text}")

View file

@ -1,52 +0,0 @@
apiVersion: apps/v1
kind: Deployment
metadata:
name: openai-mock
labels:
app: openai-mock
spec:
replicas: 1
selector:
matchLabels:
app: openai-mock
template:
metadata:
labels:
app: openai-mock
spec:
containers:
- name: openai-mock
image: python:3.12-slim
ports:
- containerPort: ${MOCK_INFERENCE_PORT}
env:
- name: PORT
value: "${MOCK_INFERENCE_PORT}"
- name: MOCK_MODELS
value: "${MOCK_INFERENCE_MODEL}"
- name: STREAM_DELAY_SECONDS
value: "${STREAM_DELAY_SECONDS}"
command: ["sh", "-c"]
args:
- |
pip install flask &&
python /app/openai-mock-server.py --port ${MOCK_INFERENCE_PORT}
volumeMounts:
- name: openai-mock-script
mountPath: /app
volumes:
- name: openai-mock-script
configMap:
name: openai-mock
---
apiVersion: v1
kind: Service
metadata:
name: openai-mock-service
spec:
selector:
app: openai-mock
ports:
- port: 8080
targetPort: 8080
type: ClusterIP

View file

@ -23,7 +23,7 @@ app = Flask(__name__)
# Models from environment variables
def get_models():
models_str = os.getenv("MOCK_MODELS", "mock-inference")
models_str = os.getenv("MOCK_MODELS", "meta-llama/Llama-3.2-3B-Instruct")
model_ids = [m.strip() for m in models_str.split(",") if m.strip()]
return {
@ -49,13 +49,13 @@ def generate_random_text(length=50):
]
return " ".join(random.choices(words, k=length))
@app.route('/models', methods=['GET'])
@app.route('/v1/models', methods=['GET'])
def list_models():
models = get_models()
print(f"[MOCK] Returning models: {[m['id'] for m in models['data']]}")
return jsonify(models)
@app.route('/chat/completions', methods=['POST'])
@app.route('/v1/chat/completions', methods=['POST'])
def chat_completions():
"""Return OpenAI-formatted chat completion responses."""
data = request.get_json()

View file

@ -0,0 +1,52 @@
#!/bin/bash
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
# Script to profile an already running Llama Stack server
# Usage: ./profile_running_server.sh [duration_seconds] [output_file]
DURATION=${1:-60} # Default 60 seconds
OUTPUT_FILE=${2:-"llama_stack_profile"} # Default output file
echo "Looking for running Llama Stack server..."
# Find the server PID
SERVER_PID=$(ps aux | grep "llama_stack.core.server.server" | grep -v grep | awk '{print $2}' | head -1)
if [ -z "$SERVER_PID" ]; then
echo "Error: No running Llama Stack server found"
echo "Please start your server first with:"
echo "LLAMA_STACK_LOGGING=\"all=ERROR\" MOCK_INFERENCE_URL=http://localhost:8080 SAFETY_MODEL=llama-guard3:1b uv run --with llama-stack python -m llama_stack.core.server.server docs/source/distributions/k8s-benchmark/stack_run_config.yaml"
exit 1
fi
echo "Found Llama Stack server with PID: $SERVER_PID"
# Start py-spy profiling
echo "Starting py-spy profiling for ${DURATION} seconds..."
echo "Output will be saved to: ${OUTPUT_FILE}.svg"
echo ""
echo "You can now run your load test..."
echo ""
# Get the full path to py-spy
PYSPY_PATH=$(which py-spy)
# Check if running as root, if not, use sudo
if [ "$EUID" -ne 0 ]; then
echo "py-spy requires root permissions on macOS. Running with sudo..."
sudo "$PYSPY_PATH" record -o "${OUTPUT_FILE}.svg" -d ${DURATION} -p $SERVER_PID
else
"$PYSPY_PATH" record -o "${OUTPUT_FILE}.svg" -d ${DURATION} -p $SERVER_PID
fi
echo ""
echo "Profiling completed! Results saved to: ${OUTPUT_FILE}.svg"
echo ""
echo "To view the flame graph:"
echo "open ${OUTPUT_FILE}.svg"

View file

@ -0,0 +1,148 @@
#!/usr/bin/env bash
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
set -euo pipefail
# Default values
TARGET="stack"
DURATION=60
CONCURRENT=10
# Parse command line arguments
usage() {
echo "Usage: $0 [options]"
echo "Options:"
echo " -t, --target <stack|vllm> Target to benchmark (default: stack)"
echo " -d, --duration <seconds> Duration in seconds (default: 60)"
echo " -c, --concurrent <users> Number of concurrent users (default: 10)"
echo " -h, --help Show this help message"
echo ""
echo "Examples:"
echo " $0 --target vllm # Benchmark vLLM direct"
echo " $0 --target stack # Benchmark Llama Stack (default)"
echo " $0 -t vllm -d 120 -c 20 # vLLM with 120s duration, 20 users"
}
while [[ $# -gt 0 ]]; do
case $1 in
-t|--target)
TARGET="$2"
shift 2
;;
-d|--duration)
DURATION="$2"
shift 2
;;
-c|--concurrent)
CONCURRENT="$2"
shift 2
;;
-h|--help)
usage
exit 0
;;
*)
echo "Unknown option: $1"
usage
exit 1
;;
esac
done
# Validate target
if [[ "$TARGET" != "stack" && "$TARGET" != "vllm" ]]; then
echo "Error: Target must be 'stack' or 'vllm'"
usage
exit 1
fi
# Set configuration based on target
if [[ "$TARGET" == "vllm" ]]; then
BASE_URL="http://vllm-server:8000/v1"
JOB_NAME="vllm-benchmark-job"
echo "Benchmarking vLLM direct..."
else
BASE_URL="http://llama-stack-benchmark-service:8323/v1/openai/v1"
JOB_NAME="stack-benchmark-job"
echo "Benchmarking Llama Stack..."
fi
echo "Configuration:"
echo " Target: $TARGET"
echo " Base URL: $BASE_URL"
echo " Duration: ${DURATION}s"
echo " Concurrent users: $CONCURRENT"
echo ""
# Create temporary job yaml
TEMP_YAML="/tmp/benchmark-job-temp-$(date +%s).yaml"
cat > "$TEMP_YAML" << EOF
apiVersion: batch/v1
kind: Job
metadata:
name: $JOB_NAME
namespace: default
spec:
template:
spec:
containers:
- name: benchmark
image: python:3.11-slim
command: ["/bin/bash"]
args:
- "-c"
- |
pip install aiohttp &&
python3 /benchmark/benchmark.py \\
--base-url $BASE_URL \\
--model \${INFERENCE_MODEL} \\
--duration $DURATION \\
--concurrent $CONCURRENT
env:
- name: INFERENCE_MODEL
value: "meta-llama/Llama-3.2-3B-Instruct"
volumeMounts:
- name: benchmark-script
mountPath: /benchmark
resources:
requests:
memory: "256Mi"
cpu: "250m"
limits:
memory: "512Mi"
cpu: "500m"
volumes:
- name: benchmark-script
configMap:
name: benchmark-script
restartPolicy: Never
backoffLimit: 3
EOF
echo "Creating benchmark ConfigMap..."
kubectl create configmap benchmark-script \
--from-file=benchmark.py=benchmark.py \
--dry-run=client -o yaml | kubectl apply -f -
echo "Cleaning up any existing benchmark job..."
kubectl delete job $JOB_NAME 2>/dev/null || true
echo "Deploying benchmark Job..."
kubectl apply -f "$TEMP_YAML"
echo "Waiting for job to start..."
kubectl wait --for=condition=Ready pod -l job-name=$JOB_NAME --timeout=60s
echo "Following benchmark logs..."
kubectl logs -f job/$JOB_NAME
echo "Job completed. Checking final status..."
kubectl get job $JOB_NAME
# Clean up temporary file
rm -f "$TEMP_YAML"

View file

@ -26,13 +26,6 @@ data:
max_tokens: ${env.VLLM_MAX_TOKENS:=4096}
api_token: ${env.VLLM_API_TOKEN:=fake}
tls_verify: ${env.VLLM_TLS_VERIFY:=true}
- provider_id: mock-vllm-inference
provider_type: remote::vllm
config:
url: http://openai-mock-service:${env.MOCK_INFERENCE_PORT}
max_tokens: 4096
api_token: fake
tls_verify: false
- provider_id: sentence-transformers
provider_type: inline::sentence-transformers
config: {}
@ -121,9 +114,6 @@ data:
- model_id: ${env.SAFETY_MODEL}
provider_id: vllm-safety
model_type: llm
- model_id: ${env.MOCK_INFERENCE_MODEL}
provider_id: mock-vllm-inference
model_type: llm
shields:
- shield_id: ${env.SAFETY_MODEL:=meta-llama/Llama-Guard-3-1B}
vector_dbs: []

View file

@ -44,8 +44,6 @@ spec:
value: "${SAFETY_MODEL}"
- name: TAVILY_SEARCH_API_KEY
value: "${TAVILY_SEARCH_API_KEY}"
- name: MOCK_INFERENCE_PORT
value: "${MOCK_INFERENCE_PORT}"
- name: VLLM_URL
value: http://vllm-server.default.svc.cluster.local:8000/v1
- name: VLLM_MAX_TOKENS
@ -54,8 +52,6 @@ spec:
value: http://vllm-server-safety.default.svc.cluster.local:8001/v1
- name: VLLM_TLS_VERIFY
value: "false"
- name: MOCK_INFERENCE_MODEL
value: "${MOCK_INFERENCE_MODEL}"
command: ["python", "-m", "llama_stack.core.server.server", "/etc/config/stack_run_config.yaml", "--port", "8323"]
ports:
- containerPort: 8323

View file

@ -3,7 +3,6 @@ image_name: kubernetes-benchmark-demo
apis:
- agents
- inference
- safety
- telemetry
- tool_runtime
- vector_io
@ -16,20 +15,6 @@ providers:
max_tokens: ${env.VLLM_MAX_TOKENS:=4096}
api_token: ${env.VLLM_API_TOKEN:=fake}
tls_verify: ${env.VLLM_TLS_VERIFY:=true}
- provider_id: vllm-safety
provider_type: remote::vllm
config:
url: ${env.VLLM_SAFETY_URL:=http://localhost:8000/v1}
max_tokens: ${env.VLLM_MAX_TOKENS:=4096}
api_token: ${env.VLLM_API_TOKEN:=fake}
tls_verify: ${env.VLLM_TLS_VERIFY:=true}
- provider_id: mock-vllm-inference
provider_type: remote::vllm
config:
url: http://openai-mock-service:${env.MOCK_INFERENCE_PORT}
max_tokens: 4096
api_token: fake
tls_verify: false
- provider_id: sentence-transformers
provider_type: inline::sentence-transformers
config: {}
@ -45,11 +30,6 @@ providers:
db: ${env.POSTGRES_DB:=llamastack}
user: ${env.POSTGRES_USER:=llamastack}
password: ${env.POSTGRES_PASSWORD:=llamastack}
safety:
- provider_id: llama-guard
provider_type: inline::llama-guard
config:
excluded_categories: []
agents:
- provider_id: meta-reference
provider_type: inline::meta-reference
@ -115,14 +95,6 @@ models:
- model_id: ${env.INFERENCE_MODEL}
provider_id: vllm-inference
model_type: llm
- model_id: ${env.SAFETY_MODEL}
provider_id: vllm-safety
model_type: llm
- model_id: ${env.MOCK_INFERENCE_MODEL}
provider_id: mock-vllm-inference
model_type: llm
shields:
- shield_id: ${env.SAFETY_MODEL:=meta-llama/Llama-Guard-3-1B}
vector_dbs: []
datasets: []
scoring_fns: []

View file

@ -2,6 +2,15 @@
## Overview
Agents API for creating and interacting with agentic systems.
Main functionalities provided by this API:
- Create agents with specific instructions and ability to use tools.
- Interactions with agents are grouped into sessions ("threads"), and each interaction is called a "turn".
- Agents can be provided with various tools (see the ToolGroups and ToolRuntime APIs for more details).
- Agents can be provided with various shields (see the Safety API for more details).
- Agents can also use Memory to retrieve information from knowledge bases. See the RAG Tool and Vector IO APIs for more details.
This section contains documentation for all available providers for the **agents** API.
## Providers

View file

@ -0,0 +1,24 @@
# Batches
## Overview
The Batches API enables efficient processing of multiple requests in a single operation,
particularly useful for processing large datasets, batch evaluation workflows, and
cost-effective inference at scale.
The API is designed to allow use of openai client libraries for seamless integration.
This API provides the following extensions:
- idempotent batch creation
Note: This API is currently under active development and may undergo changes.
This section contains documentation for all available providers for the **batches** API.
## Providers
```{toctree}
:maxdepth: 1
inline_reference
```

View file

@ -0,0 +1,23 @@
# inline::reference
## Description
Reference implementation of batches API with KVStore persistence.
## Configuration
| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `kvstore` | `utils.kvstore.config.RedisKVStoreConfig \| utils.kvstore.config.SqliteKVStoreConfig \| utils.kvstore.config.PostgresKVStoreConfig \| utils.kvstore.config.MongoDBKVStoreConfig` | No | sqlite | Configuration for the key-value store backend. |
| `max_concurrent_batches` | `<class 'int'>` | No | 1 | Maximum number of concurrent batches to process simultaneously. |
| `max_concurrent_requests_per_batch` | `<class 'int'>` | No | 10 | Maximum number of concurrent requests to process per batch. |
## Sample Configuration
```yaml
kvstore:
type: sqlite
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/dummy}/batches.db
```

View file

@ -2,6 +2,8 @@
## Overview
Llama Stack Evaluation API for running evaluations on model and agent candidates.
This section contains documentation for all available providers for the **eval** API.
## Providers

View file

@ -10,4 +10,5 @@ This section contains documentation for all available providers for the **files*
:maxdepth: 1
inline_localfs
remote_s3
```

View file

@ -0,0 +1,33 @@
# remote::s3
## Description
AWS S3-based file storage provider for scalable cloud file management with metadata persistence.
## Configuration
| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `bucket_name` | `<class 'str'>` | No | | S3 bucket name to store files |
| `region` | `<class 'str'>` | No | us-east-1 | AWS region where the bucket is located |
| `aws_access_key_id` | `str \| None` | No | | AWS access key ID (optional if using IAM roles) |
| `aws_secret_access_key` | `str \| None` | No | | AWS secret access key (optional if using IAM roles) |
| `endpoint_url` | `str \| None` | No | | Custom S3 endpoint URL (for MinIO, LocalStack, etc.) |
| `auto_create_bucket` | `<class 'bool'>` | No | False | Automatically create the S3 bucket if it doesn't exist |
| `metadata_store` | `utils.sqlstore.sqlstore.SqliteSqlStoreConfig \| utils.sqlstore.sqlstore.PostgresSqlStoreConfig` | No | sqlite | SQL store configuration for file metadata |
## Sample Configuration
```yaml
bucket_name: ${env.S3_BUCKET_NAME}
region: ${env.AWS_REGION:=us-east-1}
aws_access_key_id: ${env.AWS_ACCESS_KEY_ID:=}
aws_secret_access_key: ${env.AWS_SECRET_ACCESS_KEY:=}
endpoint_url: ${env.S3_ENDPOINT_URL:=}
auto_create_bucket: ${env.S3_AUTO_CREATE_BUCKET:=false}
metadata_store:
type: sqlite
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/dummy}/s3_files_metadata.db
```

View file

@ -2,6 +2,12 @@
## Overview
Llama Stack Inference API for generating completions, chat completions, and embeddings.
This API provides the raw interface to the underlying models. Two kinds of models are supported:
- LLM models: these models generate "raw" and "chat" (conversational) completions.
- Embedding models: these models generate embeddings to be used for semantic search.
This section contains documentation for all available providers for the **inference** API.
## Providers

View file

@ -9,7 +9,9 @@ This section contains documentation for all available providers for the **post_t
```{toctree}
:maxdepth: 1
inline_huggingface
inline_torchtune
inline_huggingface-cpu
inline_huggingface-gpu
inline_torchtune-cpu
inline_torchtune-gpu
remote_nvidia
```

View file

@ -0,0 +1,41 @@
# inline::huggingface-cpu
## Description
HuggingFace-based post-training provider for fine-tuning models using the HuggingFace ecosystem.
## Configuration
| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `device` | `<class 'str'>` | No | cuda | |
| `distributed_backend` | `Literal['fsdp', 'deepspeed'` | No | | |
| `checkpoint_format` | `Literal['full_state', 'huggingface'` | No | huggingface | |
| `chat_template` | `<class 'str'>` | No | <|user|>
{input}
<|assistant|>
{output} | |
| `model_specific_config` | `<class 'dict'>` | No | {'trust_remote_code': True, 'attn_implementation': 'sdpa'} | |
| `max_seq_length` | `<class 'int'>` | No | 2048 | |
| `gradient_checkpointing` | `<class 'bool'>` | No | False | |
| `save_total_limit` | `<class 'int'>` | No | 3 | |
| `logging_steps` | `<class 'int'>` | No | 10 | |
| `warmup_ratio` | `<class 'float'>` | No | 0.1 | |
| `weight_decay` | `<class 'float'>` | No | 0.01 | |
| `dataloader_num_workers` | `<class 'int'>` | No | 4 | |
| `dataloader_pin_memory` | `<class 'bool'>` | No | True | |
| `dpo_beta` | `<class 'float'>` | No | 0.1 | |
| `use_reference_model` | `<class 'bool'>` | No | True | |
| `dpo_loss_type` | `Literal['sigmoid', 'hinge', 'ipo', 'kto_pair'` | No | sigmoid | |
| `dpo_output_dir` | `<class 'str'>` | No | | |
## Sample Configuration
```yaml
checkpoint_format: huggingface
distributed_backend: null
device: cpu
dpo_output_dir: ~/.llama/dummy/dpo_output
```

View file

@ -0,0 +1,41 @@
# inline::huggingface-gpu
## Description
HuggingFace-based post-training provider for fine-tuning models using the HuggingFace ecosystem.
## Configuration
| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `device` | `<class 'str'>` | No | cuda | |
| `distributed_backend` | `Literal['fsdp', 'deepspeed'` | No | | |
| `checkpoint_format` | `Literal['full_state', 'huggingface'` | No | huggingface | |
| `chat_template` | `<class 'str'>` | No | <|user|>
{input}
<|assistant|>
{output} | |
| `model_specific_config` | `<class 'dict'>` | No | {'trust_remote_code': True, 'attn_implementation': 'sdpa'} | |
| `max_seq_length` | `<class 'int'>` | No | 2048 | |
| `gradient_checkpointing` | `<class 'bool'>` | No | False | |
| `save_total_limit` | `<class 'int'>` | No | 3 | |
| `logging_steps` | `<class 'int'>` | No | 10 | |
| `warmup_ratio` | `<class 'float'>` | No | 0.1 | |
| `weight_decay` | `<class 'float'>` | No | 0.01 | |
| `dataloader_num_workers` | `<class 'int'>` | No | 4 | |
| `dataloader_pin_memory` | `<class 'bool'>` | No | True | |
| `dpo_beta` | `<class 'float'>` | No | 0.1 | |
| `use_reference_model` | `<class 'bool'>` | No | True | |
| `dpo_loss_type` | `Literal['sigmoid', 'hinge', 'ipo', 'kto_pair'` | No | sigmoid | |
| `dpo_output_dir` | `<class 'str'>` | No | | |
## Sample Configuration
```yaml
checkpoint_format: huggingface
distributed_backend: null
device: cpu
dpo_output_dir: ~/.llama/dummy/dpo_output
```

View file

@ -0,0 +1,20 @@
# inline::torchtune-cpu
## Description
TorchTune-based post-training provider for fine-tuning and optimizing models using Meta's TorchTune framework.
## Configuration
| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `torch_seed` | `int \| None` | No | | |
| `checkpoint_format` | `Literal['meta', 'huggingface'` | No | meta | |
## Sample Configuration
```yaml
checkpoint_format: meta
```

View file

@ -0,0 +1,20 @@
# inline::torchtune-gpu
## Description
TorchTune-based post-training provider for fine-tuning and optimizing models using Meta's TorchTune framework.
## Configuration
| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `torch_seed` | `int \| None` | No | | |
| `checkpoint_format` | `Literal['meta', 'huggingface'` | No | meta | |
## Sample Configuration
```yaml
checkpoint_format: meta
```