Merge 9aef325934 into sapling-pr-archive-ehhuang

ehhuang 2025-10-27 15:32:50 -07:00 committed by GitHub
commit e9a8967ed5
41 changed files with 1280 additions and 197 deletions

View file

@@ -9862,7 +9862,7 @@ components:
            $ref: '#/components/schemas/RAGDocument'
          description: >-
            List of documents to index in the RAG system
-        vector_db_id:
+        vector_store_id:
          type: string
          description: >-
            ID of the vector database to store the document embeddings
@@ -9873,7 +9873,7 @@ components:
      additionalProperties: false
      required:
        - documents
-       - vector_db_id
+       - vector_store_id
        - chunk_size_in_tokens
      title: InsertRequest
    DefaultRAGQueryGeneratorConfig:
@@ -10044,7 +10044,7 @@ components:
          $ref: '#/components/schemas/InterleavedContent'
          description: >-
            The query content to search for in the indexed documents
-        vector_db_ids:
+        vector_store_ids:
          type: array
          items:
            type: string
@@ -10057,7 +10057,7 @@ components:
      additionalProperties: false
      required:
        - content
-       - vector_db_ids
+       - vector_store_ids
      title: QueryRequest
    RAGQueryResult:
      type: object
@@ -10281,7 +10281,7 @@ components:
    InsertChunksRequest:
      type: object
      properties:
-       vector_db_id:
+       vector_store_id:
          type: string
          description: >-
            The identifier of the vector database to insert the chunks into.
@@ -10300,13 +10300,13 @@ components:
          description: The time to live of the chunks.
      additionalProperties: false
      required:
-       - vector_db_id
+       - vector_store_id
        - chunks
      title: InsertChunksRequest
    QueryChunksRequest:
      type: object
      properties:
-       vector_db_id:
+       vector_store_id:
          type: string
          description: >-
            The identifier of the vector database to query.
@@ -10326,7 +10326,7 @@ components:
          description: The parameters of the query.
      additionalProperties: false
      required:
-       - vector_db_id
+       - vector_store_id
        - query
      title: QueryChunksRequest
    QueryChunksResponse:
@@ -11844,7 +11844,7 @@ components:
          description: Type of the step in an agent turn.
          const: memory_retrieval
          default: memory_retrieval
-       vector_db_ids:
+       vector_store_ids:
          type: string
          description: >-
            The IDs of the vector databases to retrieve context from.
@@ -11857,7 +11857,7 @@ components:
        - turn_id
        - step_id
        - step_type
-       - vector_db_ids
+       - vector_store_ids
        - inserted_context
      title: MemoryRetrievalStep
      description: >-
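For reference, request bodies that satisfy the renamed `InsertRequest` and `QueryRequest` schemas look roughly like this (a sketch; the store ID and document contents are placeholder values):

```python
# Illustrative payloads for the renamed fields; IDs and content are placeholders.
insert_request = {
    "documents": [
        {"document_id": "doc-1", "content": "Llama Stack ships a RAG tool runtime."}
    ],
    "vector_store_id": "my_store",     # was: vector_db_id
    "chunk_size_in_tokens": 512,
}

query_request = {
    "content": "How do I configure telemetry?",
    "vector_store_ids": ["my_store"],  # was: vector_db_ids
}
```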

View file

@@ -72,14 +72,14 @@ description: |
Example with hybrid search:
```python
response = await vector_io.query_chunks(
-    vector_db_id="my_db",
+    vector_store_id="my_db",
    query="your query here",
    params={"mode": "hybrid", "max_chunks": 3, "score_threshold": 0.7},
)

# Using RRF ranker
response = await vector_io.query_chunks(
-    vector_db_id="my_db",
+    vector_store_id="my_db",
    query="your query here",
    params={
        "mode": "hybrid",
@@ -91,7 +91,7 @@ description: |

# Using weighted ranker
response = await vector_io.query_chunks(
-    vector_db_id="my_db",
+    vector_store_id="my_db",
    query="your query here",
    params={
        "mode": "hybrid",
@@ -105,7 +105,7 @@ description: |
Example with explicit vector search:
```python
response = await vector_io.query_chunks(
-    vector_db_id="my_db",
+    vector_store_id="my_db",
    query="your query here",
    params={"mode": "vector", "max_chunks": 3, "score_threshold": 0.7},
)
@@ -114,7 +114,7 @@ description: |
Example with keyword search:
```python
response = await vector_io.query_chunks(
-    vector_db_id="my_db",
+    vector_store_id="my_db",
    query="your query here",
    params={"mode": "keyword", "max_chunks": 3, "score_threshold": 0.7},
)
@@ -277,14 +277,14 @@ The SQLite-vec provider supports three search modes:
Example with hybrid search:
```python
response = await vector_io.query_chunks(
-    vector_db_id="my_db",
+    vector_store_id="my_db",
    query="your query here",
    params={"mode": "hybrid", "max_chunks": 3, "score_threshold": 0.7},
)

# Using RRF ranker
response = await vector_io.query_chunks(
-    vector_db_id="my_db",
+    vector_store_id="my_db",
    query="your query here",
    params={
        "mode": "hybrid",
@@ -296,7 +296,7 @@ response = await vector_io.query_chunks(

# Using weighted ranker
response = await vector_io.query_chunks(
-    vector_db_id="my_db",
+    vector_store_id="my_db",
    query="your query here",
    params={
        "mode": "hybrid",
@@ -310,7 +310,7 @@ response = await vector_io.query_chunks(
Example with explicit vector search:
```python
response = await vector_io.query_chunks(
-    vector_db_id="my_db",
+    vector_store_id="my_db",
    query="your query here",
    params={"mode": "vector", "max_chunks": 3, "score_threshold": 0.7},
)
@@ -319,7 +319,7 @@ response = await vector_io.query_chunks(
Example with keyword search:
```python
response = await vector_io.query_chunks(
-    vector_db_id="my_db",
+    vector_store_id="my_db",
    query="your query here",
    params={"mode": "keyword", "max_chunks": 3, "score_threshold": 0.7},
)
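The query examples above assume the store is already populated; a minimal insertion sketch using the same renamed parameter (the chunk layout is illustrative) is:

```python
# Populate the store before running the query examples above.
await vector_io.insert_chunks(
    vector_store_id="my_db",
    chunks=[
        {
            "content": "Llama Stack supports vector, keyword, and hybrid search.",
            "metadata": {"document_id": "doc-1"},
        }
    ],
)
```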

View file

@@ -4390,7 +4390,7 @@
        "const": "memory_retrieval",
        "default": "memory_retrieval"
      },
-     "vector_db_ids": {
+     "vector_store_ids": {
        "type": "string",
        "description": "The IDs of the vector databases to retrieve context from."
      },
@@ -4404,7 +4404,7 @@
        "turn_id",
        "step_id",
        "step_type",
-       "vector_db_ids",
+       "vector_store_ids",
        "inserted_context"
      ],
      "title": "MemoryRetrievalStep",

View file

@@ -3252,7 +3252,7 @@ components:
          description: Type of the step in an agent turn.
          const: memory_retrieval
          default: memory_retrieval
-       vector_db_ids:
+       vector_store_ids:
          type: string
          description: >-
            The IDs of the vector databases to retrieve context from.
@@ -3265,7 +3265,7 @@ components:
        - turn_id
        - step_id
        - step_type
-       - vector_db_ids
+       - vector_store_ids
        - inserted_context
      title: MemoryRetrievalStep
      description: >-

View file

@@ -2865,7 +2865,7 @@
        "const": "memory_retrieval",
        "default": "memory_retrieval"
      },
-     "vector_db_ids": {
+     "vector_store_ids": {
        "type": "string",
        "description": "The IDs of the vector databases to retrieve context from."
      },
@@ -2879,7 +2879,7 @@
        "turn_id",
        "step_id",
        "step_type",
-       "vector_db_ids",
+       "vector_store_ids",
        "inserted_context"
      ],
      "title": "MemoryRetrievalStep",

View file

@@ -2085,7 +2085,7 @@ components:
          description: Type of the step in an agent turn.
          const: memory_retrieval
          default: memory_retrieval
-       vector_db_ids:
+       vector_store_ids:
          type: string
          description: >-
            The IDs of the vector databases to retrieve context from.
@@ -2098,7 +2098,7 @@ components:
        - turn_id
        - step_id
        - step_type
-       - vector_db_ids
+       - vector_store_ids
        - inserted_context
      title: MemoryRetrievalStep
      description: >-

View file

@@ -11412,7 +11412,7 @@
        },
        "description": "List of documents to index in the RAG system"
      },
-     "vector_db_id": {
+     "vector_store_id": {
        "type": "string",
        "description": "ID of the vector database to store the document embeddings"
      },
@@ -11424,7 +11424,7 @@
      "additionalProperties": false,
      "required": [
        "documents",
-       "vector_db_id",
+       "vector_store_id",
        "chunk_size_in_tokens"
      ],
      "title": "InsertRequest"
@@ -11615,7 +11615,7 @@
        "$ref": "#/components/schemas/InterleavedContent",
        "description": "The query content to search for in the indexed documents"
      },
-     "vector_db_ids": {
+     "vector_store_ids": {
        "type": "array",
        "items": {
          "type": "string"
@@ -11630,7 +11630,7 @@
      "additionalProperties": false,
      "required": [
        "content",
-       "vector_db_ids"
+       "vector_store_ids"
      ],
      "title": "QueryRequest"
    },
@@ -11923,7 +11923,7 @@
    "InsertChunksRequest": {
      "type": "object",
      "properties": {
-       "vector_db_id": {
+       "vector_store_id": {
          "type": "string",
          "description": "The identifier of the vector database to insert the chunks into."
        },
@@ -11941,7 +11941,7 @@
      },
      "additionalProperties": false,
      "required": [
-       "vector_db_id",
+       "vector_store_id",
        "chunks"
      ],
      "title": "InsertChunksRequest"
@@ -11949,7 +11949,7 @@
    "QueryChunksRequest": {
      "type": "object",
      "properties": {
-       "vector_db_id": {
+       "vector_store_id": {
          "type": "string",
          "description": "The identifier of the vector database to query."
        },
@@ -11986,7 +11986,7 @@
      },
      "additionalProperties": false,
      "required": [
-       "vector_db_id",
+       "vector_store_id",
        "query"
      ],
      "title": "QueryChunksRequest"

View file

@@ -8649,7 +8649,7 @@ components:
            $ref: '#/components/schemas/RAGDocument'
          description: >-
            List of documents to index in the RAG system
-        vector_db_id:
+        vector_store_id:
          type: string
          description: >-
            ID of the vector database to store the document embeddings
@@ -8660,7 +8660,7 @@ components:
      additionalProperties: false
      required:
        - documents
-       - vector_db_id
+       - vector_store_id
        - chunk_size_in_tokens
      title: InsertRequest
    DefaultRAGQueryGeneratorConfig:
@@ -8831,7 +8831,7 @@ components:
          $ref: '#/components/schemas/InterleavedContent'
          description: >-
            The query content to search for in the indexed documents
-        vector_db_ids:
+        vector_store_ids:
          type: array
          items:
            type: string
@@ -8844,7 +8844,7 @@ components:
      additionalProperties: false
      required:
        - content
-       - vector_db_ids
+       - vector_store_ids
      title: QueryRequest
    RAGQueryResult:
      type: object
@@ -9068,7 +9068,7 @@ components:
    InsertChunksRequest:
      type: object
      properties:
-       vector_db_id:
+       vector_store_id:
          type: string
          description: >-
            The identifier of the vector database to insert the chunks into.
@@ -9087,13 +9087,13 @@ components:
          description: The time to live of the chunks.
      additionalProperties: false
      required:
-       - vector_db_id
+       - vector_store_id
        - chunks
      title: InsertChunksRequest
    QueryChunksRequest:
      type: object
      properties:
-       vector_db_id:
+       vector_store_id:
          type: string
          description: >-
            The identifier of the vector database to query.
@@ -9113,7 +9113,7 @@ components:
          description: The parameters of the query.
      additionalProperties: false
      required:
-       - vector_db_id
+       - vector_store_id
        - query
      title: QueryChunksRequest
    QueryChunksResponse:

View file

@@ -13084,7 +13084,7 @@
        },
        "description": "List of documents to index in the RAG system"
      },
-     "vector_db_id": {
+     "vector_store_id": {
        "type": "string",
        "description": "ID of the vector database to store the document embeddings"
      },
@@ -13096,7 +13096,7 @@
      "additionalProperties": false,
      "required": [
        "documents",
-       "vector_db_id",
+       "vector_store_id",
        "chunk_size_in_tokens"
      ],
      "title": "InsertRequest"
@@ -13287,7 +13287,7 @@
        "$ref": "#/components/schemas/InterleavedContent",
        "description": "The query content to search for in the indexed documents"
      },
-     "vector_db_ids": {
+     "vector_store_ids": {
        "type": "array",
        "items": {
          "type": "string"
@@ -13302,7 +13302,7 @@
      "additionalProperties": false,
      "required": [
        "content",
-       "vector_db_ids"
+       "vector_store_ids"
      ],
      "title": "QueryRequest"
    },
@@ -13595,7 +13595,7 @@
    "InsertChunksRequest": {
      "type": "object",
      "properties": {
-       "vector_db_id": {
+       "vector_store_id": {
          "type": "string",
          "description": "The identifier of the vector database to insert the chunks into."
        },
@@ -13613,7 +13613,7 @@
      },
      "additionalProperties": false,
      "required": [
-       "vector_db_id",
+       "vector_store_id",
        "chunks"
      ],
      "title": "InsertChunksRequest"
@@ -13621,7 +13621,7 @@
    "QueryChunksRequest": {
      "type": "object",
      "properties": {
-       "vector_db_id": {
+       "vector_store_id": {
          "type": "string",
          "description": "The identifier of the vector database to query."
        },
@@ -13658,7 +13658,7 @@
      },
      "additionalProperties": false,
      "required": [
-       "vector_db_id",
+       "vector_store_id",
        "query"
      ],
      "title": "QueryChunksRequest"
@@ -15719,7 +15719,7 @@
        "const": "memory_retrieval",
        "default": "memory_retrieval"
      },
-     "vector_db_ids": {
+     "vector_store_ids": {
        "type": "string",
        "description": "The IDs of the vector databases to retrieve context from."
      },
@@ -15733,7 +15733,7 @@
        "turn_id",
        "step_id",
        "step_type",
-       "vector_db_ids",
+       "vector_store_ids",
        "inserted_context"
      ],
      "title": "MemoryRetrievalStep",

View file

@@ -9862,7 +9862,7 @@ components:
            $ref: '#/components/schemas/RAGDocument'
          description: >-
            List of documents to index in the RAG system
-        vector_db_id:
+        vector_store_id:
          type: string
          description: >-
            ID of the vector database to store the document embeddings
@@ -9873,7 +9873,7 @@ components:
      additionalProperties: false
      required:
        - documents
-       - vector_db_id
+       - vector_store_id
        - chunk_size_in_tokens
      title: InsertRequest
    DefaultRAGQueryGeneratorConfig:
@@ -10044,7 +10044,7 @@ components:
          $ref: '#/components/schemas/InterleavedContent'
          description: >-
            The query content to search for in the indexed documents
-        vector_db_ids:
+        vector_store_ids:
          type: array
          items:
            type: string
@@ -10057,7 +10057,7 @@ components:
      additionalProperties: false
      required:
        - content
-       - vector_db_ids
+       - vector_store_ids
      title: QueryRequest
    RAGQueryResult:
      type: object
@@ -10281,7 +10281,7 @@ components:
    InsertChunksRequest:
      type: object
      properties:
-       vector_db_id:
+       vector_store_id:
          type: string
          description: >-
            The identifier of the vector database to insert the chunks into.
@@ -10300,13 +10300,13 @@ components:
          description: The time to live of the chunks.
      additionalProperties: false
      required:
-       - vector_db_id
+       - vector_store_id
        - chunks
      title: InsertChunksRequest
    QueryChunksRequest:
      type: object
      properties:
-       vector_db_id:
+       vector_store_id:
          type: string
          description: >-
            The identifier of the vector database to query.
@@ -10326,7 +10326,7 @@ components:
          description: The parameters of the query.
      additionalProperties: false
      required:
-       - vector_db_id
+       - vector_store_id
        - query
      title: QueryChunksRequest
    QueryChunksResponse:
@@ -11844,7 +11844,7 @@ components:
          description: Type of the step in an agent turn.
          const: memory_retrieval
          default: memory_retrieval
-       vector_db_ids:
+       vector_store_ids:
          type: string
          description: >-
            The IDs of the vector databases to retrieve context from.
@@ -11857,7 +11857,7 @@ components:
        - turn_id
        - step_id
        - step_type
-       - vector_db_ids
+       - vector_store_ids
        - inserted_context
      title: MemoryRetrievalStep
      description: >-

View file

@@ -30,8 +30,10 @@ materialize_telemetry_configs() {
  local otel_cfg="${dest}/otel-collector-config.yaml"
  local prom_cfg="${dest}/prometheus.yml"
  local graf_cfg="${dest}/grafana-datasources.yaml"
+ local graf_dash_cfg="${dest}/grafana-dashboards.yaml"
+ local dash_json="${dest}/llama-stack-dashboard.json"
- for asset in "$otel_cfg" "$prom_cfg" "$graf_cfg"; do
+ for asset in "$otel_cfg" "$prom_cfg" "$graf_cfg" "$graf_dash_cfg" "$dash_json"; do
    if [ -e "$asset" ]; then
      die "Telemetry asset ${asset} already exists; refusing to overwrite"
    fi
@@ -103,6 +105,7 @@ datasources:
    type: prometheus
    access: proxy
    url: http://prometheus:9090
+   uid: prometheus
    isDefault: true
    editable: true
@@ -112,6 +115,224 @@ datasources:
    url: http://jaeger:16686
    editable: true
EOF
cat <<'EOF' > "$graf_dash_cfg"
apiVersion: 1
providers:
- name: 'Llama Stack'
orgId: 1
folder: ''
type: file
disableDeletion: false
updateIntervalSeconds: 10
allowUiUpdates: true
options:
path: /etc/grafana/provisioning/dashboards
EOF
# Copy the dashboard JSON inline to avoid line-length issues
cat > "$dash_json" <<'DASHBOARD_JSON'
{
"annotations": {
"list": []
},
"editable": true,
"fiscalYearStartMonth": 0,
"graphTooltip": 0,
"id": null,
"links": [],
"liveNow": false,
"panels": [
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"fieldConfig": {
"defaults": {
"custom": {
"drawStyle": "line",
"lineInterpolation": "linear",
"showPoints": "auto",
"fillOpacity": 10
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [{"color": "green", "value": null}]
}
}
},
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 0},
"id": 1,
"options": {
"legend": {"calcs": [], "displayMode": "table", "placement": "bottom", "showLegend": true},
"tooltip": {"mode": "multi", "sort": "none"}
},
"targets": [
{
"datasource": {"type": "prometheus", "uid": "prometheus"},
"expr": "llama_stack_completion_tokens_total",
"legendFormat": "{{model_id}} ({{provider_id}})",
"refId": "A"
}
],
"title": "Completion Tokens",
"type": "timeseries"
},
{
"datasource": {"type": "prometheus", "uid": "$(DS_PROMETHEUS}"},
"fieldConfig": {
"defaults": {
"custom": {"drawStyle": "line", "lineInterpolation": "linear", "showPoints": "auto", "fillOpacity": 10},
"mappings": [],
"thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}]}
}
},
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 0},
"id": 2,
"options": {
"legend": {"calcs": [], "displayMode": "table", "placement": "bottom", "showLegend": true},
"tooltip": {"mode": "multi", "sort": "none"}
},
"targets": [
{"datasource": {"type": "prometheus", "uid": "$(DS_PROMETHEUS}"}, "expr": "llama_stack_prompt_tokens_total", "legendFormat": "Prompt - {{model_id}}", "refId": "A"},
{"datasource": {"type": "prometheus", "uid": "$(DS_PROMETHEUS}"}, "expr": "llama_stack_tokens_total", "legendFormat": "Total - {{model_id}}", "refId": "B"}
],
"title": "Prompt & Total Tokens",
"type": "timeseries"
},
{
"datasource": {"type": "prometheus", "uid": "$(DS_PROMETHEUS}"},
"fieldConfig": {
"defaults": {
"custom": {"drawStyle": "line", "lineInterpolation": "linear", "showPoints": "auto", "fillOpacity": 10},
"mappings": [],
"thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}]},
"unit": "ms"
}
},
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 8},
"id": 3,
"options": {
"legend": {"calcs": [], "displayMode": "list", "placement": "bottom", "showLegend": true},
"tooltip": {"mode": "multi", "sort": "none"}
},
"targets": [
{"datasource": {"type": "prometheus", "uid": "$(DS_PROMETHEUS}"}, "expr": "histogram_quantile(0.95, rate(llama_stack_http_server_duration_milliseconds_bucket[5m]))", "legendFormat": "p95", "refId": "A"},
{"datasource": {"type": "prometheus", "uid": "$(DS_PROMETHEUS}"}, "expr": "histogram_quantile(0.99, rate(llama_stack_http_server_duration_milliseconds_bucket[5m]))", "legendFormat": "p99", "refId": "B"}
],
"title": "HTTP Request Duration (p95, p99)",
"type": "timeseries"
},
{
"datasource": {"type": "prometheus", "uid": "$(DS_PROMETHEUS}"},
"fieldConfig": {
"defaults": {
"mappings": [],
"thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}]}
}
},
"gridPos": {"h": 8, "w": 6, "x": 12, "y": 8},
"id": 4,
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "auto",
"orientation": "auto",
"reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": false},
"textMode": "auto"
},
"targets": [
{"datasource": {"type": "prometheus", "uid": "$(DS_PROMETHEUS}"}, "expr": "sum(llama_stack_http_server_duration_milliseconds_count)", "refId": "A"}
],
"title": "Total Requests",
"type": "stat"
},
{
"datasource": {"type": "prometheus", "uid": "$(DS_PROMETHEUS}"},
"fieldConfig": {
"defaults": {
"mappings": [],
"thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}]}
}
},
"gridPos": {"h": 8, "w": 6, "x": 18, "y": 8},
"id": 5,
"options": {
"colorMode": "value",
"graphMode": "none",
"justifyMode": "auto",
"orientation": "auto",
"reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": false},
"textMode": "auto"
},
"targets": [
{"datasource": {"type": "prometheus", "uid": "$(DS_PROMETHEUS}"}, "expr": "sum(llama_stack_http_server_active_requests)", "refId": "A"}
],
"title": "Active Requests",
"type": "stat"
},
{
"datasource": {"type": "prometheus", "uid": "$(DS_PROMETHEUS}"},
"fieldConfig": {
"defaults": {
"custom": {"drawStyle": "line", "lineInterpolation": "linear", "showPoints": "auto", "fillOpacity": 10},
"mappings": [],
"thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}]},
"unit": "reqps"
}
},
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 16},
"id": 6,
"options": {
"legend": {"calcs": [], "displayMode": "list", "placement": "bottom", "showLegend": true},
"tooltip": {"mode": "multi", "sort": "none"}
},
"targets": [
{"datasource": {"type": "prometheus", "uid": "$(DS_PROMETHEUS}"}, "expr": "rate(llama_stack_http_server_duration_milliseconds_count[5m])", "legendFormat": "{{http_target}} - {{http_status_code}}", "refId": "A"}
],
"title": "Request Rate",
"type": "timeseries"
},
{
"datasource": {"type": "prometheus", "uid": "$(DS_PROMETHEUS}"},
"fieldConfig": {
"defaults": {
"custom": {"drawStyle": "line", "lineInterpolation": "linear", "showPoints": "auto", "fillOpacity": 10},
"mappings": [],
"thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}]},
"unit": "Bps"
}
},
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 16},
"id": 7,
"options": {
"legend": {"calcs": [], "displayMode": "list", "placement": "bottom", "showLegend": true},
"tooltip": {"mode": "multi", "sort": "none"}
},
"targets": [
{"datasource": {"type": "prometheus", "uid": "$(DS_PROMETHEUS}"}, "expr": "rate(llama_stack_http_server_request_size_bytes_sum[5m])", "legendFormat": "Request", "refId": "A"},
{"datasource": {"type": "prometheus", "uid": "$(DS_PROMETHEUS}"}, "expr": "rate(llama_stack_http_server_response_size_bytes_sum[5m])", "legendFormat": "Response", "refId": "B"}
],
"title": "Request/Response Sizes",
"type": "timeseries"
}
],
"refresh": "5s",
"schemaVersion": 38,
"tags": ["llama-stack"],
"templating": {"list": []},
"time": {"from": "now-15m", "to": "now"},
"timepicker": {},
"timezone": "browser",
"title": "Llama Stack Metrics",
"uid": "llama-stack-metrics",
"version": 0,
"weekStart": ""
}
DASHBOARD_JSON
}

# Cleanup function to remove temporary files
@@ -372,6 +593,8 @@ if [ "$WITH_TELEMETRY" = true ]; then
    -e GF_SECURITY_ADMIN_PASSWORD=admin \
    -e GF_USERS_ALLOW_SIGN_UP=false \
    -v "${TELEMETRY_ASSETS_DIR}/grafana-datasources.yaml:/etc/grafana/provisioning/datasources/datasources.yaml:Z" \
+   -v "${TELEMETRY_ASSETS_DIR}/grafana-dashboards.yaml:/etc/grafana/provisioning/dashboards/dashboards.yaml:Z" \
+   -v "${TELEMETRY_ASSETS_DIR}/llama-stack-dashboard.json:/etc/grafana/provisioning/dashboards/llama-stack-dashboard.json:Z" \
    docker.io/grafana/grafana:11.0.0 > /dev/null 2>&1; then
    die "Grafana startup failed"
  fi
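A quick way to confirm the provisioned containers came up (a sketch; it assumes Grafana and Prometheus are published on the default host ports 3000 and 9090):

```python
# Post-start sanity check for the telemetry stack; host ports are assumptions.
import urllib.request

for name, url in {
    "grafana": "http://localhost:3000/api/health",
    "prometheus": "http://localhost:9090/-/ready",
}.items():
    with urllib.request.urlopen(url, timeout=5) as resp:
        print(f"{name}: HTTP {resp.status}")
```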

View file

@@ -0,0 +1,12 @@
apiVersion: 1
providers:
- name: 'Llama Stack'
orgId: 1
folder: ''
type: file
disableDeletion: false
updateIntervalSeconds: 10
allowUiUpdates: true
options:
path: /etc/grafana/provisioning/dashboards

View file

@@ -5,6 +5,7 @@ datasources:
    type: prometheus
    access: proxy
    url: http://prometheus:9090
+   uid: prometheus
    isDefault: true
    editable: true

View file

@@ -0,0 +1,457 @@
{
"annotations": {
"list": []
},
"editable": true,
"fiscalYearStartMonth": 0,
"graphTooltip": 0,
"id": null,
"links": [],
"liveNow": false,
"panels": [
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"fieldConfig": {
"defaults": {
"custom": {
"drawStyle": "line",
"lineInterpolation": "linear",
"showPoints": "auto",
"fillOpacity": 10
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
}
]
}
}
},
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 0
},
"id": 1,
"options": {
"legend": {
"calcs": [],
"displayMode": "table",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"mode": "multi",
"sort": "none"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"expr": "llama_stack_completion_tokens_total",
"legendFormat": "{{model_id}} ({{provider_id}})",
"refId": "A"
}
],
"title": "Completion Tokens",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"fieldConfig": {
"defaults": {
"custom": {
"drawStyle": "line",
"lineInterpolation": "linear",
"showPoints": "auto",
"fillOpacity": 10
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
}
]
}
}
},
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 0
},
"id": 2,
"options": {
"legend": {
"calcs": [],
"displayMode": "table",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"mode": "multi",
"sort": "none"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"expr": "llama_stack_prompt_tokens_total",
"legendFormat": "Prompt - {{model_id}}",
"refId": "A"
},
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"expr": "llama_stack_tokens_total",
"legendFormat": "Total - {{model_id}}",
"refId": "B"
}
],
"title": "Prompt & Total Tokens",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"fieldConfig": {
"defaults": {
"custom": {
"drawStyle": "line",
"lineInterpolation": "linear",
"showPoints": "auto",
"fillOpacity": 10
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
}
]
},
"unit": "ms"
}
},
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 8
},
"id": 3,
"options": {
"legend": {
"calcs": [],
"displayMode": "list",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"mode": "multi",
"sort": "none"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"expr": "histogram_quantile(0.95, rate(llama_stack_http_server_duration_milliseconds_bucket[5m]))",
"legendFormat": "p95",
"refId": "A"
},
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"expr": "histogram_quantile(0.99, rate(llama_stack_http_server_duration_milliseconds_bucket[5m]))",
"legendFormat": "p99",
"refId": "B"
}
],
"title": "HTTP Request Duration (p95, p99)",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"fieldConfig": {
"defaults": {
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
}
]
}
}
},
"gridPos": {
"h": 8,
"w": 6,
"x": 12,
"y": 8
},
"id": 4,
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "auto",
"orientation": "auto",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "auto"
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"expr": "sum(llama_stack_http_server_duration_milliseconds_count)",
"refId": "A"
}
],
"title": "Total Requests",
"type": "stat"
},
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"fieldConfig": {
"defaults": {
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
}
]
}
}
},
"gridPos": {
"h": 8,
"w": 6,
"x": 18,
"y": 8
},
"id": 5,
"options": {
"colorMode": "value",
"graphMode": "none",
"justifyMode": "auto",
"orientation": "auto",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "auto"
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"expr": "sum(llama_stack_http_server_active_requests)",
"refId": "A"
}
],
"title": "Active Requests",
"type": "stat"
},
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"fieldConfig": {
"defaults": {
"custom": {
"drawStyle": "line",
"lineInterpolation": "linear",
"showPoints": "auto",
"fillOpacity": 10
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
}
]
},
"unit": "reqps"
}
},
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 16
},
"id": 6,
"options": {
"legend": {
"calcs": [],
"displayMode": "list",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"mode": "multi",
"sort": "none"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"expr": "rate(llama_stack_http_server_duration_milliseconds_count[5m])",
"legendFormat": "{{http_target}} - {{http_status_code}}",
"refId": "A"
}
],
"title": "Request Rate",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"fieldConfig": {
"defaults": {
"custom": {
"drawStyle": "line",
"lineInterpolation": "linear",
"showPoints": "auto",
"fillOpacity": 10
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
}
]
},
"unit": "Bps"
}
},
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 16
},
"id": 7,
"options": {
"legend": {
"calcs": [],
"displayMode": "list",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"mode": "multi",
"sort": "none"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"expr": "rate(llama_stack_http_server_request_size_bytes_sum[5m])",
"legendFormat": "Request",
"refId": "A"
},
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"expr": "rate(llama_stack_http_server_response_size_bytes_sum[5m])",
"legendFormat": "Response",
"refId": "B"
}
],
"title": "Request/Response Sizes",
"type": "timeseries"
}
],
"refresh": "5s",
"schemaVersion": 38,
"tags": [
"llama-stack"
],
"templating": {
"list": []
},
"time": {
"from": "now-15m",
"to": "now"
},
"timepicker": {},
"timezone": "browser",
"title": "Llama Stack Metrics",
"uid": "llama-stack-metrics",
"version": 0,
"weekStart": ""
}
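To verify that the series behind these panels are actually being scraped, one can query Prometheus directly (a sketch; it assumes Prometheus is reachable on localhost:9090):

```python
# Query one of the dashboard's metrics straight from the Prometheus HTTP API.
import json
import urllib.parse
import urllib.request

query = "sum(llama_stack_completion_tokens_total)"
url = "http://localhost:9090/api/v1/query?" + urllib.parse.urlencode({"query": query})
with urllib.request.urlopen(url, timeout=5) as resp:
    result = json.load(resp)
print(result["data"]["result"])
```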

View file

@@ -135,6 +135,8 @@ $CONTAINER_RUNTIME run -d --name grafana \
  -e GF_SECURITY_ADMIN_PASSWORD=admin \
  -e GF_USERS_ALLOW_SIGN_UP=false \
  -v "$SCRIPT_DIR/grafana-datasources.yaml:/etc/grafana/provisioning/datasources/datasources.yaml:Z" \
+ -v "$SCRIPT_DIR/grafana-dashboards.yaml:/etc/grafana/provisioning/dashboards/dashboards.yaml:Z" \
+ -v "$SCRIPT_DIR/llama-stack-dashboard.json:/etc/grafana/provisioning/dashboards/llama-stack-dashboard.json:Z" \
  docker.io/grafana/grafana:11.0.0

# Wait for services to start

View file

@@ -149,13 +149,13 @@ class ShieldCallStep(StepCommon):
class MemoryRetrievalStep(StepCommon):
    """A memory retrieval step in an agent turn.

-   :param vector_db_ids: The IDs of the vector databases to retrieve context from.
+   :param vector_store_ids: The IDs of the vector databases to retrieve context from.
    :param inserted_context: The context retrieved from the vector databases.
    """

    step_type: Literal[StepType.memory_retrieval] = StepType.memory_retrieval
    # TODO: should this be List[str]?
-   vector_db_ids: str
+   vector_store_ids: str
    inserted_context: InterleavedContent
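A minimal construction sketch for the renamed datatype (the import path and defaults are assumptions; IDs and context are placeholders):

```python
from llama_stack.apis.agents import MemoryRetrievalStep  # assumed import path

step = MemoryRetrievalStep(
    turn_id="turn-123",
    step_id="step-456",
    # Still a single string for now -- see the List[str] TODO above.
    vector_store_ids="my_store",
    inserted_context="Retrieved context goes here.",
)
```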

View file

@@ -21,8 +21,8 @@ from typing_extensions import TypedDict
from llama_stack.apis.common.content_types import ContentDelta, InterleavedContent
from llama_stack.apis.common.responses import Order
from llama_stack.apis.models import Model
-from llama_stack.apis.telemetry import MetricResponseMixin
from llama_stack.apis.version import LLAMA_STACK_API_V1, LLAMA_STACK_API_V1ALPHA
+from llama_stack.core.telemetry.telemetry import MetricResponseMixin
from llama_stack.core.telemetry.trace_protocol import trace_protocol
from llama_stack.models.llama.datatypes import (
    BuiltinTool,

View file

@@ -190,13 +190,13 @@ class RAGToolRuntime(Protocol):
    async def insert(
        self,
        documents: list[RAGDocument],
-       vector_db_id: str,
+       vector_store_id: str,
        chunk_size_in_tokens: int = 512,
    ) -> None:
        """Index documents so they can be used by the RAG system.

        :param documents: List of documents to index in the RAG system
-       :param vector_db_id: ID of the vector database to store the document embeddings
+       :param vector_store_id: ID of the vector database to store the document embeddings
        :param chunk_size_in_tokens: (Optional) Size in tokens for document chunking during indexing
        """
        ...

@@ -205,13 +205,13 @@ class RAGToolRuntime(Protocol):
    async def query(
        self,
        content: InterleavedContent,
-       vector_db_ids: list[str],
+       vector_store_ids: list[str],
        query_config: RAGQueryConfig | None = None,
    ) -> RAGQueryResult:
        """Query the RAG system for context; typically invoked by the agent.

        :param content: The query content to search for in the indexed documents
-       :param vector_db_ids: List of vector database IDs to search within
+       :param vector_store_ids: List of vector database IDs to search within
        :param query_config: (Optional) Configuration parameters for the query operation
        :returns: RAGQueryResult containing the retrieved content and metadata
        """

View file

@@ -529,17 +529,17 @@ class VectorIO(Protocol):
    # this will just block now until chunks are inserted, but it should
    # probably return a Job instance which can be polled for completion
-   # TODO: rename vector_db_id to vector_store_id once Stainless is working
+   # TODO: rename vector_store_id to vector_store_id once Stainless is working
    @webmethod(route="/vector-io/insert", method="POST", level=LLAMA_STACK_API_V1)
    async def insert_chunks(
        self,
-       vector_db_id: str,
+       vector_store_id: str,
        chunks: list[Chunk],
        ttl_seconds: int | None = None,
    ) -> None:
        """Insert chunks into a vector database.

-       :param vector_db_id: The identifier of the vector database to insert the chunks into.
+       :param vector_store_id: The identifier of the vector database to insert the chunks into.
        :param chunks: The chunks to insert. Each `Chunk` should contain content which can be interleaved text, images, or other types.
            `metadata`: `dict[str, Any]` and `embedding`: `List[float]` are optional.
            If `metadata` is provided, you configure how Llama Stack formats the chunk during generation.
@@ -548,17 +548,17 @@ class VectorIO(Protocol):
        """
        ...

-   # TODO: rename vector_db_id to vector_store_id once Stainless is working
+   # TODO: rename vector_store_id to vector_store_id once Stainless is working
    @webmethod(route="/vector-io/query", method="POST", level=LLAMA_STACK_API_V1)
    async def query_chunks(
        self,
-       vector_db_id: str,
+       vector_store_id: str,
        query: InterleavedContent,
        params: dict[str, Any] | None = None,
    ) -> QueryChunksResponse:
        """Query chunks from a vector database.

-       :param vector_db_id: The identifier of the vector database to query.
+       :param vector_store_id: The identifier of the vector database to query.
        :param query: The query to search for.
        :param params: The parameters of the query.
        :returns: A QueryChunksResponse.
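A hedged usage sketch of the renamed `VectorIO` methods; `vector_io` is any implementation of this protocol and `chunks` is a list of `Chunk` objects prepared by the caller:

```python
# Insert chunks, then query them, using the renamed parameter.
await vector_io.insert_chunks(vector_store_id="my_store", chunks=chunks)
response = await vector_io.query_chunks(
    vector_store_id="my_store",
    query="your query here",
    params={"mode": "vector", "max_chunks": 3},
)
```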

View file

@@ -312,3 +312,6 @@ class ConversationServiceImpl(Conversations):
        logger.debug(f"Deleted item {item_id} from conversation {conversation_id}")
        return ConversationItemDeletedResource(id=item_id)

+   async def shutdown(self) -> None:
+       pass

View file

@@ -230,3 +230,6 @@ class PromptServiceImpl(Prompts):
        await self.kvstore.set(default_key, str(version))
        return self._deserialize_prompt(data)

+   async def shutdown(self) -> None:
+       pass

View file

@@ -53,7 +53,7 @@ from llama_stack.apis.inference.inference import (
    OpenAIChatCompletionContentPartTextParam,
)
from llama_stack.apis.models import Model, ModelType
-from llama_stack.apis.telemetry import MetricEvent, MetricInResponse
+from llama_stack.core.telemetry.telemetry import MetricEvent, MetricInResponse
from llama_stack.core.telemetry.tracing import enqueue_event, get_current_span
from llama_stack.log import get_logger
from llama_stack.models.llama.llama3.chat_format import ChatFormat

View file

@@ -73,27 +73,27 @@ class VectorIORouter(VectorIO):
    async def insert_chunks(
        self,
-       vector_db_id: str,
+       vector_store_id: str,
        chunks: list[Chunk],
        ttl_seconds: int | None = None,
    ) -> None:
        doc_ids = [chunk.document_id for chunk in chunks[:3]]
        logger.debug(
-           f"VectorIORouter.insert_chunks: {vector_db_id}, {len(chunks)} chunks, "
+           f"VectorIORouter.insert_chunks: {vector_store_id}, {len(chunks)} chunks, "
            f"ttl_seconds={ttl_seconds}, chunk_ids={doc_ids}{' and more...' if len(chunks) > 3 else ''}"
        )
-       provider = await self.routing_table.get_provider_impl(vector_db_id)
-       return await provider.insert_chunks(vector_db_id, chunks, ttl_seconds)
+       provider = await self.routing_table.get_provider_impl(vector_store_id)
+       return await provider.insert_chunks(vector_store_id, chunks, ttl_seconds)

    async def query_chunks(
        self,
-       vector_db_id: str,
+       vector_store_id: str,
        query: InterleavedContent,
        params: dict[str, Any] | None = None,
    ) -> QueryChunksResponse:
-       logger.debug(f"VectorIORouter.query_chunks: {vector_db_id}")
-       provider = await self.routing_table.get_provider_impl(vector_db_id)
-       return await provider.query_chunks(vector_db_id, query, params)
+       logger.debug(f"VectorIORouter.query_chunks: {vector_store_id}")
+       provider = await self.routing_table.get_provider_impl(vector_store_id)
+       return await provider.query_chunks(vector_store_id, query, params)

    # OpenAI Vector Stores API endpoints
    async def openai_create_vector_store(

View file

@@ -31,7 +31,6 @@ from llama_stack.apis.scoring import Scoring
from llama_stack.apis.scoring_functions import ScoringFunctions
from llama_stack.apis.shields import Shields
from llama_stack.apis.synthetic_data_generation import SyntheticDataGeneration
-from llama_stack.apis.telemetry import Telemetry
from llama_stack.apis.tools import RAGToolRuntime, ToolGroups, ToolRuntime
from llama_stack.apis.vector_io import VectorIO
from llama_stack.core.conversations.conversations import ConversationServiceConfig, ConversationServiceImpl
@@ -67,7 +66,6 @@ class LlamaStack(
    Safety,
    SyntheticDataGeneration,
    Datasets,
-   Telemetry,
    PostTraining,
    VectorIO,
    Eval,

View file

@@ -6,7 +6,13 @@
import os
import threading
-from typing import Any
+from datetime import datetime
+from enum import Enum
+from typing import (
+    Annotated,
+    Any,
+    Literal,
+)

from opentelemetry import metrics, trace
from opentelemetry.exporter.otlp.proto.http.metric_exporter import OTLPMetricExporter
@@ -16,21 +22,399 @@ from opentelemetry.sdk.metrics.export import PeriodicExportingMetricReader
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.trace.export import BatchSpanProcessor
from opentelemetry.trace.propagation.tracecontext import TraceContextTextMapPropagator
+from pydantic import BaseModel, Field

-from llama_stack.apis.telemetry import (
-    Event,
-    MetricEvent,
-    SpanEndPayload,
-    SpanStartPayload,
-    SpanStatus,
-    StructuredLogEvent,
-    UnstructuredLogEvent,
-)
-from llama_stack.apis.telemetry import (
-    Telemetry as TelemetryBase,
-)
-from llama_stack.core.telemetry.tracing import ROOT_SPAN_MARKERS
from llama_stack.log import get_logger
+from llama_stack.models.llama.datatypes import Primitive
+from llama_stack.schema_utils import json_schema_type, register_schema

+ROOT_SPAN_MARKERS = ["__root__", "__root_span__"]
@json_schema_type
class SpanStatus(Enum):
"""The status of a span indicating whether it completed successfully or with an error.
:cvar OK: Span completed successfully without errors
:cvar ERROR: Span completed with an error or failure
"""
OK = "ok"
ERROR = "error"
@json_schema_type
class Span(BaseModel):
"""A span representing a single operation within a trace.
:param span_id: Unique identifier for the span
:param trace_id: Unique identifier for the trace this span belongs to
:param parent_span_id: (Optional) Unique identifier for the parent span, if this is a child span
:param name: Human-readable name describing the operation this span represents
:param start_time: Timestamp when the operation began
:param end_time: (Optional) Timestamp when the operation finished, if completed
:param attributes: (Optional) Key-value pairs containing additional metadata about the span
"""
span_id: str
trace_id: str
parent_span_id: str | None = None
name: str
start_time: datetime
end_time: datetime | None = None
attributes: dict[str, Any] | None = Field(default_factory=lambda: {})
def set_attribute(self, key: str, value: Any):
if self.attributes is None:
self.attributes = {}
self.attributes[key] = value
@json_schema_type
class Trace(BaseModel):
"""A trace representing the complete execution path of a request across multiple operations.
:param trace_id: Unique identifier for the trace
:param root_span_id: Unique identifier for the root span that started this trace
:param start_time: Timestamp when the trace began
:param end_time: (Optional) Timestamp when the trace finished, if completed
"""
trace_id: str
root_span_id: str
start_time: datetime
end_time: datetime | None = None
@json_schema_type
class EventType(Enum):
"""The type of telemetry event being logged.
:cvar UNSTRUCTURED_LOG: A simple log message with severity level
:cvar STRUCTURED_LOG: A structured log event with typed payload data
:cvar METRIC: A metric measurement with value and unit
"""
UNSTRUCTURED_LOG = "unstructured_log"
STRUCTURED_LOG = "structured_log"
METRIC = "metric"
@json_schema_type
class LogSeverity(Enum):
"""The severity level of a log message.
:cvar VERBOSE: Detailed diagnostic information for troubleshooting
:cvar DEBUG: Debug information useful during development
:cvar INFO: General informational messages about normal operation
:cvar WARN: Warning messages about potentially problematic situations
:cvar ERROR: Error messages indicating failures that don't stop execution
:cvar CRITICAL: Critical error messages indicating severe failures
"""
VERBOSE = "verbose"
DEBUG = "debug"
INFO = "info"
WARN = "warn"
ERROR = "error"
CRITICAL = "critical"
class EventCommon(BaseModel):
"""Common fields shared by all telemetry events.
:param trace_id: Unique identifier for the trace this event belongs to
:param span_id: Unique identifier for the span this event belongs to
:param timestamp: Timestamp when the event occurred
:param attributes: (Optional) Key-value pairs containing additional metadata about the event
"""
trace_id: str
span_id: str
timestamp: datetime
attributes: dict[str, Primitive] | None = Field(default_factory=lambda: {})
@json_schema_type
class UnstructuredLogEvent(EventCommon):
"""An unstructured log event containing a simple text message.
:param type: Event type identifier set to UNSTRUCTURED_LOG
:param message: The log message text
:param severity: The severity level of the log message
"""
type: Literal[EventType.UNSTRUCTURED_LOG] = EventType.UNSTRUCTURED_LOG
message: str
severity: LogSeverity
@json_schema_type
class MetricEvent(EventCommon):
"""A metric event containing a measured value.
:param type: Event type identifier set to METRIC
:param metric: The name of the metric being measured
:param value: The numeric value of the metric measurement
:param unit: The unit of measurement for the metric value
"""
type: Literal[EventType.METRIC] = EventType.METRIC
metric: str # this would be an enum
value: int | float
unit: str
@json_schema_type
class MetricInResponse(BaseModel):
"""A metric value included in API responses.
:param metric: The name of the metric
:param value: The numeric value of the metric
:param unit: (Optional) The unit of measurement for the metric value
"""
metric: str
value: int | float
unit: str | None = None
# This is a short term solution to allow inference API to return metrics
# The ideal way to do this is to have a way for all response types to include metrics
# and all metric events logged to the telemetry API to be included with the response
# To do this, we will need to augment all response types with a metrics field.
# We have hit a blocker from stainless SDK that prevents us from doing this.
# The blocker is that if we were to augment the response types that have a data field
# in them like so
# class ListModelsResponse(BaseModel):
# metrics: Optional[List[MetricEvent]] = None
# data: List[Models]
# ...
# The client SDK will need to access the data by using a .data field, which is not
# ergonomic. Stainless SDK does support unwrapping the response type, but it
# requires that the response type to only have a single field.
# We will need a way in the client SDK to signal that the metrics are needed
# and if they are needed, the client SDK has to return the full response type
# without unwrapping it.
class MetricResponseMixin(BaseModel):
"""Mixin class for API responses that can include metrics.
:param metrics: (Optional) List of metrics associated with the API response
"""
metrics: list[MetricInResponse] | None = None
@json_schema_type
class StructuredLogType(Enum):
"""The type of structured log event payload.
:cvar SPAN_START: Event indicating the start of a new span
:cvar SPAN_END: Event indicating the completion of a span
"""
SPAN_START = "span_start"
SPAN_END = "span_end"
@json_schema_type
class SpanStartPayload(BaseModel):
"""Payload for a span start event.
:param type: Payload type identifier set to SPAN_START
:param name: Human-readable name describing the operation this span represents
:param parent_span_id: (Optional) Unique identifier for the parent span, if this is a child span
"""
type: Literal[StructuredLogType.SPAN_START] = StructuredLogType.SPAN_START
name: str
parent_span_id: str | None = None
@json_schema_type
class SpanEndPayload(BaseModel):
"""Payload for a span end event.
:param type: Payload type identifier set to SPAN_END
:param status: The final status of the span indicating success or failure
"""
type: Literal[StructuredLogType.SPAN_END] = StructuredLogType.SPAN_END
status: SpanStatus
StructuredLogPayload = Annotated[
SpanStartPayload | SpanEndPayload,
Field(discriminator="type"),
]
register_schema(StructuredLogPayload, name="StructuredLogPayload")
@json_schema_type
class StructuredLogEvent(EventCommon):
"""A structured log event containing typed payload data.
:param type: Event type identifier set to STRUCTURED_LOG
:param payload: The structured payload data for the log event
"""
type: Literal[EventType.STRUCTURED_LOG] = EventType.STRUCTURED_LOG
payload: StructuredLogPayload
Event = Annotated[
UnstructuredLogEvent | MetricEvent | StructuredLogEvent,
Field(discriminator="type"),
]
register_schema(Event, name="Event")
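# Illustrative sketch (assumes pydantic v2's TypeAdapter): because `Event` is a
# discriminated union on `type`, a raw dict deserializes to the matching concrete
# event class.
#
#     from pydantic import TypeAdapter
#
#     event = TypeAdapter(Event).validate_python(
#         {
#             "type": "metric",
#             "trace_id": "trace-1",
#             "span_id": "span-1",
#             "timestamp": "2025-01-01T00:00:00Z",
#             "metric": "prompt_tokens",
#             "value": 12,
#             "unit": "tokens",
#         }
#     )
#     assert isinstance(event, MetricEvent)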
@json_schema_type
class EvalTrace(BaseModel):
"""A trace record for evaluation purposes.
:param session_id: Unique identifier for the evaluation session
:param step: The evaluation step or phase identifier
:param input: The input data for the evaluation
:param output: The actual output produced during evaluation
:param expected_output: The expected output for comparison during evaluation
"""
session_id: str
step: str
input: str
output: str
expected_output: str
@json_schema_type
class SpanWithStatus(Span):
"""A span that includes status information.
:param status: (Optional) The current status of the span
"""
status: SpanStatus | None = None
@json_schema_type
class QueryConditionOp(Enum):
"""Comparison operators for query conditions.
:cvar EQ: Equal to comparison
:cvar NE: Not equal to comparison
:cvar GT: Greater than comparison
:cvar LT: Less than comparison
"""
EQ = "eq"
NE = "ne"
GT = "gt"
LT = "lt"
@json_schema_type
class QueryCondition(BaseModel):
"""A condition for filtering query results.
:param key: The attribute key to filter on
:param op: The comparison operator to apply
:param value: The value to compare against
"""
key: str
op: QueryConditionOp
value: Any
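# Illustrative sketch: building attribute filters for a trace query, e.g. "traces
# for session abc123 that did not end with status ok". How multiple conditions are
# combined is up to the backing telemetry store.
#
#     conditions = [
#         QueryCondition(key="session_id", op=QueryConditionOp.EQ, value="abc123"),
#         QueryCondition(key="status", op=QueryConditionOp.NE, value="ok"),
#     ]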
class QueryTracesResponse(BaseModel):
"""Response containing a list of traces.
:param data: List of traces matching the query criteria
"""
data: list[Trace]
class QuerySpansResponse(BaseModel):
"""Response containing a list of spans.
:param data: List of spans matching the query criteria
"""
data: list[Span]
class QuerySpanTreeResponse(BaseModel):
"""Response containing a tree structure of spans.
:param data: Dictionary mapping span IDs to spans with status information
"""
data: dict[str, SpanWithStatus]
class MetricQueryType(Enum):
"""The type of metric query to perform.
:cvar RANGE: Query metrics over a time range
:cvar INSTANT: Query metrics at a specific point in time
"""
RANGE = "range"
INSTANT = "instant"
class MetricLabelOperator(Enum):
"""Operators for matching metric labels.
:cvar EQUALS: Label value must equal the specified value
:cvar NOT_EQUALS: Label value must not equal the specified value
:cvar REGEX_MATCH: Label value must match the specified regular expression
:cvar REGEX_NOT_MATCH: Label value must not match the specified regular expression
"""
EQUALS = "="
NOT_EQUALS = "!="
REGEX_MATCH = "=~"
REGEX_NOT_MATCH = "!~"
class MetricLabelMatcher(BaseModel):
"""A matcher for filtering metrics by label values.
:param name: The name of the label to match
:param value: The value to match against
:param operator: The comparison operator to use for matching
"""
name: str
value: str
operator: MetricLabelOperator = MetricLabelOperator.EQUALS
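# Illustrative sketch: the operators mirror Prometheus-style label selectors, so the
# matcher list below corresponds roughly to {model=~"llama.*", provider!="stub"}.
#
#     matchers = [
#         MetricLabelMatcher(name="model", value="llama.*", operator=MetricLabelOperator.REGEX_MATCH),
#         MetricLabelMatcher(name="provider", value="stub", operator=MetricLabelOperator.NOT_EQUALS),
#     ]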
@json_schema_type
class MetricLabel(BaseModel):
"""A label associated with a metric.
:param name: The name of the label
:param value: The value of the label
"""
name: str
value: str
@json_schema_type
class MetricDataPoint(BaseModel):
"""A single data point in a metric time series.
:param timestamp: Unix timestamp when the metric value was recorded
:param value: The numeric value of the metric at this timestamp
:param unit: The unit of measurement for the metric value
"""
timestamp: int
value: float
unit: str
@json_schema_type
class MetricSeries(BaseModel):
"""A time series of metric data points.
:param metric: The name of the metric
:param labels: List of labels associated with this metric series
:param values: List of data points in chronological order
"""
metric: str
labels: list[MetricLabel]
values: list[MetricDataPoint]
class QueryMetricsResponse(BaseModel):
"""Response containing metric time series data.
:param data: List of metric series matching the query criteria
"""
data: list[MetricSeries]
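# Illustrative sketch: a range query result is a list of series, each carrying its
# labels and chronologically ordered data points.
#
#     series = MetricSeries(
#         metric="prompt_tokens",
#         labels=[MetricLabel(name="model", value="llama-3.1-8b")],
#         values=[
#             MetricDataPoint(timestamp=1729000000, value=512.0, unit="tokens"),
#             MetricDataPoint(timestamp=1729000060, value=768.0, unit="tokens"),
#         ],
#     )
#     response = QueryMetricsResponse(data=[series])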

_GLOBAL_STORAGE: dict[str, dict[str | int, Any]] = {
    "active_spans": {},

@@ -49,7 +433,7 @@ def is_tracing_enabled(tracer):
    return span.is_recording()

-class Telemetry(TelemetryBase):
+class Telemetry:
    def __init__(self) -> None:
        self.meter = None

@@ -17,7 +17,8 @@ from datetime import UTC, datetime
from functools import wraps
from typing import Any, Self

-from llama_stack.apis.telemetry import (
+from llama_stack.core.telemetry.telemetry import (
+    ROOT_SPAN_MARKERS,
    Event,
    LogSeverity,
    Span,

@@ -47,7 +48,6 @@ if not _fallback_logger.handlers:
INVALID_SPAN_ID = 0x0000000000000000
INVALID_TRACE_ID = 0x00000000000000000000000000000000

-ROOT_SPAN_MARKERS = ["__root__", "__root_span__"]
# The logical root span may not be visible to this process if a parent context
# is passed in. The local root span is the first local span in a trace.
LOCAL_ROOT_SPAN_MARKER = "__local_root_span__"

@@ -488,13 +488,13 @@ class ChatAgent(ShieldRunnerMixin):
        session_info = await self.storage.get_session_info(session_id)
        # if the session has a memory bank id, let the memory tool use it
-        if session_info and session_info.vector_db_id:
+        if session_info and session_info.vector_store_id:
            for tool_name in self.tool_name_to_args.keys():
                if tool_name == MEMORY_QUERY_TOOL:
-                    if "vector_db_ids" not in self.tool_name_to_args[tool_name]:
+                    if "vector_store_ids" not in self.tool_name_to_args[tool_name]:
-                        self.tool_name_to_args[tool_name]["vector_db_ids"] = [session_info.vector_db_id]
+                        self.tool_name_to_args[tool_name]["vector_store_ids"] = [session_info.vector_store_id]
                    else:
-                        self.tool_name_to_args[tool_name]["vector_db_ids"].append(session_info.vector_db_id)
+                        self.tool_name_to_args[tool_name]["vector_store_ids"].append(session_info.vector_store_id)

        output_attachments = []

@@ -22,7 +22,7 @@ log = get_logger(name=__name__, category="agents::meta_reference")
class AgentSessionInfo(Session):
    # TODO: is this used anywhere?
-    vector_db_id: str | None = None
+    vector_store_id: str | None = None
    started_at: datetime
    owner: User | None = None
    identifier: str | None = None

@@ -93,12 +93,12 @@ class AgentPersistence:
        return session_info

-    async def add_vector_db_to_session(self, session_id: str, vector_db_id: str):
+    async def add_vector_db_to_session(self, session_id: str, vector_store_id: str):
        session_info = await self.get_session_if_accessible(session_id)
        if session_info is None:
            raise SessionNotFoundError(session_id)

-        session_info.vector_db_id = vector_db_id
+        session_info.vector_store_id = vector_store_id
        await self.kvstore.set(
            key=f"session:{self.agent_id}:{session_id}",
            value=session_info.model_dump_json(),

@@ -119,7 +119,7 @@ class MemoryToolRuntimeImpl(ToolGroupsProtocolPrivate, ToolRuntime, RAGToolRunti
    async def insert(
        self,
        documents: list[RAGDocument],
-        vector_db_id: str,
+        vector_store_id: str,
        chunk_size_in_tokens: int = 512,
    ) -> None:
        if not documents:

@@ -158,14 +158,14 @@ class MemoryToolRuntimeImpl(ToolGroupsProtocolPrivate, ToolRuntime, RAGToolRunti
            try:
                await self.vector_io_api.openai_attach_file_to_vector_store(
-                    vector_store_id=vector_db_id,
+                    vector_store_id=vector_store_id,
                    file_id=created_file.id,
                    attributes=doc.metadata,
                    chunking_strategy=chunking_strategy,
                )
            except Exception as e:
                log.error(
-                    f"Failed to attach file {created_file.id} to vector store {vector_db_id} for document {doc.document_id}: {e}"
+                    f"Failed to attach file {created_file.id} to vector store {vector_store_id} for document {doc.document_id}: {e}"
                )
                continue

@@ -176,10 +176,10 @@ class MemoryToolRuntimeImpl(ToolGroupsProtocolPrivate, ToolRuntime, RAGToolRunti
    async def query(
        self,
        content: InterleavedContent,
-        vector_db_ids: list[str],
+        vector_store_ids: list[str],
        query_config: RAGQueryConfig | None = None,
    ) -> RAGQueryResult:
-        if not vector_db_ids:
+        if not vector_store_ids:
            raise ValueError(
                "No vector DBs were provided to the knowledge search tool. Please provide at least one vector DB ID."
            )

@@ -192,7 +192,7 @@ class MemoryToolRuntimeImpl(ToolGroupsProtocolPrivate, ToolRuntime, RAGToolRunti
        )
        tasks = [
            self.vector_io_api.query_chunks(
-                vector_db_id=vector_db_id,
+                vector_store_id=vector_store_id,
                query=query,
                params={
                    "mode": query_config.mode,

@@ -201,18 +201,18 @@ class MemoryToolRuntimeImpl(ToolGroupsProtocolPrivate, ToolRuntime, RAGToolRunti
                    "ranker": query_config.ranker,
                },
            )
-            for vector_db_id in vector_db_ids
+            for vector_store_id in vector_store_ids
        ]
        results: list[QueryChunksResponse] = await asyncio.gather(*tasks)
        chunks = []
        scores = []
-        for vector_db_id, result in zip(vector_db_ids, results, strict=False):
+        for vector_store_id, result in zip(vector_store_ids, results, strict=False):
            for chunk, score in zip(result.chunks, result.scores, strict=False):
                if not hasattr(chunk, "metadata") or chunk.metadata is None:
                    chunk.metadata = {}
-                chunk.metadata["vector_db_id"] = vector_db_id
+                chunk.metadata["vector_store_id"] = vector_store_id
                chunks.append(chunk)
                scores.append(score)

@@ -250,7 +250,7 @@ class MemoryToolRuntimeImpl(ToolGroupsProtocolPrivate, ToolRuntime, RAGToolRunti
        metadata_keys_to_exclude_from_context = [
            "token_count",
            "metadata_token_count",
-            "vector_db_id",
+            "vector_store_id",
        ]
        metadata_for_context = {}
        for k in chunk_metadata_keys_to_include_from_context:

@@ -275,7 +275,7 @@ class MemoryToolRuntimeImpl(ToolGroupsProtocolPrivate, ToolRuntime, RAGToolRunti
                "document_ids": [c.document_id for c in chunks[: len(picked)]],
                "chunks": [c.content for c in chunks[: len(picked)]],
                "scores": scores[: len(picked)],
-                "vector_db_ids": [c.metadata["vector_db_id"] for c in chunks[: len(picked)]],
+                "vector_store_ids": [c.metadata["vector_store_id"] for c in chunks[: len(picked)]],
            },
        )

@@ -309,7 +309,7 @@ class MemoryToolRuntimeImpl(ToolGroupsProtocolPrivate, ToolRuntime, RAGToolRunti
        )

    async def invoke_tool(self, tool_name: str, kwargs: dict[str, Any]) -> ToolInvocationResult:
-        vector_db_ids = kwargs.get("vector_db_ids", [])
+        vector_store_ids = kwargs.get("vector_store_ids", [])
        query_config = kwargs.get("query_config")
        if query_config:
            query_config = TypeAdapter(RAGQueryConfig).validate_python(query_config)

@@ -319,7 +319,7 @@ class MemoryToolRuntimeImpl(ToolGroupsProtocolPrivate, ToolRuntime, RAGToolRunti
        query = kwargs["query"]
        result = await self.query(
            content=query,
-            vector_db_ids=vector_db_ids,
+            vector_store_ids=vector_store_ids,
            query_config=query_config,
        )

@@ -248,19 +248,19 @@ class FaissVectorIOAdapter(OpenAIVectorStoreMixin, VectorIO, VectorStoresProtoco
        del self.cache[vector_store_id]
        await self.kvstore.delete(f"{VECTOR_DBS_PREFIX}{vector_store_id}")

-    async def insert_chunks(self, vector_db_id: str, chunks: list[Chunk], ttl_seconds: int | None = None) -> None:
+    async def insert_chunks(self, vector_store_id: str, chunks: list[Chunk], ttl_seconds: int | None = None) -> None:
-        index = self.cache.get(vector_db_id)
+        index = self.cache.get(vector_store_id)
        if index is None:
-            raise ValueError(f"Vector DB {vector_db_id} not found. found: {self.cache.keys()}")
+            raise ValueError(f"Vector DB {vector_store_id} not found. found: {self.cache.keys()}")
        await index.insert_chunks(chunks)

    async def query_chunks(
-        self, vector_db_id: str, query: InterleavedContent, params: dict[str, Any] | None = None
+        self, vector_store_id: str, query: InterleavedContent, params: dict[str, Any] | None = None
    ) -> QueryChunksResponse:
-        index = self.cache.get(vector_db_id)
+        index = self.cache.get(vector_store_id)
        if index is None:
-            raise VectorStoreNotFoundError(vector_db_id)
+            raise VectorStoreNotFoundError(vector_store_id)
        return await index.query_chunks(query, params)

@@ -447,20 +447,20 @@ class SQLiteVecVectorIOAdapter(OpenAIVectorStoreMixin, VectorIO, VectorStoresPro
        await self.cache[vector_store_id].index.delete()
        del self.cache[vector_store_id]

-    async def insert_chunks(self, vector_db_id: str, chunks: list[Chunk], ttl_seconds: int | None = None) -> None:
+    async def insert_chunks(self, vector_store_id: str, chunks: list[Chunk], ttl_seconds: int | None = None) -> None:
-        index = await self._get_and_cache_vector_store_index(vector_db_id)
+        index = await self._get_and_cache_vector_store_index(vector_store_id)
        if not index:
-            raise VectorStoreNotFoundError(vector_db_id)
+            raise VectorStoreNotFoundError(vector_store_id)
        # The VectorStoreWithIndex helper is expected to compute embeddings via the inference_api
        # and then call our index's add_chunks.
        await index.insert_chunks(chunks)

    async def query_chunks(
-        self, vector_db_id: str, query: Any, params: dict[str, Any] | None = None
+        self, vector_store_id: str, query: Any, params: dict[str, Any] | None = None
    ) -> QueryChunksResponse:
-        index = await self._get_and_cache_vector_store_index(vector_db_id)
+        index = await self._get_and_cache_vector_store_index(vector_store_id)
        if not index:
-            raise VectorStoreNotFoundError(vector_db_id)
+            raise VectorStoreNotFoundError(vector_store_id)
        return await index.query_chunks(query, params)

    async def delete_chunks(self, store_id: str, chunks_for_deletion: list[ChunkForDeletion]) -> None:

@@ -163,14 +163,14 @@ The SQLite-vec provider supports three search modes:
Example with hybrid search:

```python
response = await vector_io.query_chunks(
-    vector_db_id="my_db",
+    vector_store_id="my_db",
    query="your query here",
    params={"mode": "hybrid", "max_chunks": 3, "score_threshold": 0.7},
)

# Using RRF ranker
response = await vector_io.query_chunks(
-    vector_db_id="my_db",
+    vector_store_id="my_db",
    query="your query here",
    params={
        "mode": "hybrid",

@@ -182,7 +182,7 @@ response = await vector_io.query_chunks(
# Using weighted ranker
response = await vector_io.query_chunks(
-    vector_db_id="my_db",
+    vector_store_id="my_db",
    query="your query here",
    params={
        "mode": "hybrid",

@@ -196,7 +196,7 @@ response = await vector_io.query_chunks(
Example with explicit vector search:

```python
response = await vector_io.query_chunks(
-    vector_db_id="my_db",
+    vector_store_id="my_db",
    query="your query here",
    params={"mode": "vector", "max_chunks": 3, "score_threshold": 0.7},
)

@@ -205,7 +205,7 @@ response = await vector_io.query_chunks(
Example with keyword search:

```python
response = await vector_io.query_chunks(
-    vector_db_id="my_db",
+    vector_store_id="my_db",
    query="your query here",
    params={"mode": "keyword", "max_chunks": 3, "score_threshold": 0.7},
)

@@ -169,20 +169,20 @@ class ChromaVectorIOAdapter(OpenAIVectorStoreMixin, VectorIO, VectorStoresProtoc
        await self.cache[vector_store_id].index.delete()
        del self.cache[vector_store_id]

-    async def insert_chunks(self, vector_db_id: str, chunks: list[Chunk], ttl_seconds: int | None = None) -> None:
+    async def insert_chunks(self, vector_store_id: str, chunks: list[Chunk], ttl_seconds: int | None = None) -> None:
-        index = await self._get_and_cache_vector_store_index(vector_db_id)
+        index = await self._get_and_cache_vector_store_index(vector_store_id)
        if index is None:
-            raise ValueError(f"Vector DB {vector_db_id} not found in Chroma")
+            raise ValueError(f"Vector DB {vector_store_id} not found in Chroma")
        await index.insert_chunks(chunks)

    async def query_chunks(
-        self, vector_db_id: str, query: InterleavedContent, params: dict[str, Any] | None = None
+        self, vector_store_id: str, query: InterleavedContent, params: dict[str, Any] | None = None
    ) -> QueryChunksResponse:
-        index = await self._get_and_cache_vector_store_index(vector_db_id)
+        index = await self._get_and_cache_vector_store_index(vector_store_id)
        if index is None:
-            raise ValueError(f"Vector DB {vector_db_id} not found in Chroma")
+            raise ValueError(f"Vector DB {vector_store_id} not found in Chroma")
        return await index.query_chunks(query, params)

@@ -348,19 +348,19 @@ class MilvusVectorIOAdapter(OpenAIVectorStoreMixin, VectorIO, VectorStoresProtoc
        await self.cache[vector_store_id].index.delete()
        del self.cache[vector_store_id]

-    async def insert_chunks(self, vector_db_id: str, chunks: list[Chunk], ttl_seconds: int | None = None) -> None:
+    async def insert_chunks(self, vector_store_id: str, chunks: list[Chunk], ttl_seconds: int | None = None) -> None:
-        index = await self._get_and_cache_vector_store_index(vector_db_id)
+        index = await self._get_and_cache_vector_store_index(vector_store_id)
        if not index:
-            raise VectorStoreNotFoundError(vector_db_id)
+            raise VectorStoreNotFoundError(vector_store_id)
        await index.insert_chunks(chunks)

    async def query_chunks(
-        self, vector_db_id: str, query: InterleavedContent, params: dict[str, Any] | None = None
+        self, vector_store_id: str, query: InterleavedContent, params: dict[str, Any] | None = None
    ) -> QueryChunksResponse:
-        index = await self._get_and_cache_vector_store_index(vector_db_id)
+        index = await self._get_and_cache_vector_store_index(vector_store_id)
        if not index:
-            raise VectorStoreNotFoundError(vector_db_id)
+            raise VectorStoreNotFoundError(vector_store_id)
        return await index.query_chunks(query, params)

    async def delete_chunks(self, store_id: str, chunks_for_deletion: list[ChunkForDeletion]) -> None:

@@ -399,14 +399,14 @@ class PGVectorVectorIOAdapter(OpenAIVectorStoreMixin, VectorIO, VectorStoresProt
        assert self.kvstore is not None
        await self.kvstore.delete(key=f"{VECTOR_DBS_PREFIX}{vector_store_id}")

-    async def insert_chunks(self, vector_db_id: str, chunks: list[Chunk], ttl_seconds: int | None = None) -> None:
+    async def insert_chunks(self, vector_store_id: str, chunks: list[Chunk], ttl_seconds: int | None = None) -> None:
-        index = await self._get_and_cache_vector_store_index(vector_db_id)
+        index = await self._get_and_cache_vector_store_index(vector_store_id)
        await index.insert_chunks(chunks)

    async def query_chunks(
-        self, vector_db_id: str, query: InterleavedContent, params: dict[str, Any] | None = None
+        self, vector_store_id: str, query: InterleavedContent, params: dict[str, Any] | None = None
    ) -> QueryChunksResponse:
-        index = await self._get_and_cache_vector_store_index(vector_db_id)
+        index = await self._get_and_cache_vector_store_index(vector_store_id)
        return await index.query_chunks(query, params)

    async def _get_and_cache_vector_store_index(self, vector_store_id: str) -> VectorStoreWithIndex:

@@ -222,19 +222,19 @@ class QdrantVectorIOAdapter(OpenAIVectorStoreMixin, VectorIO, VectorStoresProtoc
        self.cache[vector_store_id] = index
        return index

-    async def insert_chunks(self, vector_db_id: str, chunks: list[Chunk], ttl_seconds: int | None = None) -> None:
+    async def insert_chunks(self, vector_store_id: str, chunks: list[Chunk], ttl_seconds: int | None = None) -> None:
-        index = await self._get_and_cache_vector_store_index(vector_db_id)
+        index = await self._get_and_cache_vector_store_index(vector_store_id)
        if not index:
-            raise VectorStoreNotFoundError(vector_db_id)
+            raise VectorStoreNotFoundError(vector_store_id)
        await index.insert_chunks(chunks)

    async def query_chunks(
-        self, vector_db_id: str, query: InterleavedContent, params: dict[str, Any] | None = None
+        self, vector_store_id: str, query: InterleavedContent, params: dict[str, Any] | None = None
    ) -> QueryChunksResponse:
-        index = await self._get_and_cache_vector_store_index(vector_db_id)
+        index = await self._get_and_cache_vector_store_index(vector_store_id)
        if not index:
-            raise VectorStoreNotFoundError(vector_db_id)
+            raise VectorStoreNotFoundError(vector_store_id)
        return await index.query_chunks(query, params)

@@ -366,19 +366,19 @@ class WeaviateVectorIOAdapter(OpenAIVectorStoreMixin, VectorIO, NeedsRequestProv
        self.cache[vector_store_id] = index
        return index

-    async def insert_chunks(self, vector_db_id: str, chunks: list[Chunk], ttl_seconds: int | None = None) -> None:
+    async def insert_chunks(self, vector_store_id: str, chunks: list[Chunk], ttl_seconds: int | None = None) -> None:
-        index = await self._get_and_cache_vector_store_index(vector_db_id)
+        index = await self._get_and_cache_vector_store_index(vector_store_id)
        if not index:
-            raise VectorStoreNotFoundError(vector_db_id)
+            raise VectorStoreNotFoundError(vector_store_id)
        await index.insert_chunks(chunks)

    async def query_chunks(
-        self, vector_db_id: str, query: InterleavedContent, params: dict[str, Any] | None = None
+        self, vector_store_id: str, query: InterleavedContent, params: dict[str, Any] | None = None
    ) -> QueryChunksResponse:
-        index = await self._get_and_cache_vector_store_index(vector_db_id)
+        index = await self._get_and_cache_vector_store_index(vector_store_id)
        if not index:
-            raise VectorStoreNotFoundError(vector_db_id)
+            raise VectorStoreNotFoundError(vector_store_id)
        return await index.query_chunks(query, params)

@@ -333,7 +333,7 @@ class OpenAIVectorStoreMixin(ABC):
    @abstractmethod
    async def insert_chunks(
        self,
-        vector_db_id: str,
+        vector_store_id: str,
        chunks: list[Chunk],
        ttl_seconds: int | None = None,
    ) -> None:

@@ -342,7 +342,7 @@ class OpenAIVectorStoreMixin(ABC):
    @abstractmethod
    async def query_chunks(
-        self, vector_db_id: str, query: Any, params: dict[str, Any] | None = None
+        self, vector_store_id: str, query: Any, params: dict[str, Any] | None = None
    ) -> QueryChunksResponse:
        """Query chunks from a vector database (provider-specific implementation)."""
        pass

@@ -609,7 +609,7 @@ class OpenAIVectorStoreMixin(ABC):
        # TODO: Add support for ranking_options.ranker
        response = await self.query_chunks(
-            vector_db_id=vector_store_id,
+            vector_store_id=vector_store_id,
            query=search_query,
            params=params,
        )

@@ -803,7 +803,7 @@ class OpenAIVectorStoreMixin(ABC):
                )
            else:
                await self.insert_chunks(
-                    vector_db_id=vector_store_id,
+                    vector_store_id=vector_store_id,
                    chunks=chunks,
                )
            vector_store_file_object.status = "completed"

@@ -367,7 +367,7 @@ def test_openai_vector_store_with_chunks(
    # Insert chunks using the native LlamaStack API (since OpenAI API doesn't have direct chunk insertion)
    llama_client.vector_io.insert(
-        vector_db_id=vector_store.id,
+        vector_store_id=vector_store.id,
        chunks=sample_chunks,
    )

@@ -434,7 +434,7 @@ def test_openai_vector_store_search_relevance(
    # Insert chunks using native API
    llama_client.vector_io.insert(
-        vector_db_id=vector_store.id,
+        vector_store_id=vector_store.id,
        chunks=sample_chunks,
    )

@@ -484,7 +484,7 @@ def test_openai_vector_store_search_with_ranking_options(
    # Insert chunks
    llama_client.vector_io.insert(
-        vector_db_id=vector_store.id,
+        vector_store_id=vector_store.id,
        chunks=sample_chunks,
    )

@@ -544,7 +544,7 @@ def test_openai_vector_store_search_with_high_score_filter(
    # Insert chunks
    llama_client.vector_io.insert(
-        vector_db_id=vector_store.id,
+        vector_store_id=vector_store.id,
        chunks=sample_chunks,
    )

@@ -610,7 +610,7 @@ def test_openai_vector_store_search_with_max_num_results(
    # Insert chunks
    llama_client.vector_io.insert(
-        vector_db_id=vector_store.id,
+        vector_store_id=vector_store.id,
        chunks=sample_chunks,
    )

@@ -1175,7 +1175,7 @@ def test_openai_vector_store_search_modes(
    )
    client_with_models.vector_io.insert(
-        vector_db_id=vector_store.id,
+        vector_store_id=vector_store.id,
        chunks=sample_chunks,
    )
    query = "Python programming language"

@@ -123,12 +123,12 @@ def test_insert_chunks(
    actual_vector_store_id = create_response.id

    client_with_empty_registry.vector_io.insert(
-        vector_db_id=actual_vector_store_id,
+        vector_store_id=actual_vector_store_id,
        chunks=sample_chunks,
    )

    response = client_with_empty_registry.vector_io.query(
-        vector_db_id=actual_vector_store_id,
+        vector_store_id=actual_vector_store_id,
        query="What is the capital of France?",
    )
    assert response is not None

@@ -137,7 +137,7 @@ def test_insert_chunks(
        query, expected_doc_id = test_case
        response = client_with_empty_registry.vector_io.query(
-            vector_db_id=actual_vector_store_id,
+            vector_store_id=actual_vector_store_id,
            query=query,
        )
        assert response is not None

@@ -174,13 +174,13 @@ def test_insert_chunks_with_precomputed_embeddings(
    ]

    client_with_empty_registry.vector_io.insert(
-        vector_db_id=actual_vector_store_id,
+        vector_store_id=actual_vector_store_id,
        chunks=chunks_with_embeddings,
    )

    provider = [p.provider_id for p in client_with_empty_registry.providers.list() if p.api == "vector_io"][0]
    response = client_with_empty_registry.vector_io.query(
-        vector_db_id=actual_vector_store_id,
+        vector_store_id=actual_vector_store_id,
        query="precomputed embedding test",
        params=vector_io_provider_params_dict.get(provider, None),
    )

@@ -224,13 +224,13 @@ def test_query_returns_valid_object_when_identical_to_embedding_in_vdb(
    ]

    client_with_empty_registry.vector_io.insert(
-        vector_db_id=actual_vector_store_id,
+        vector_store_id=actual_vector_store_id,
        chunks=chunks_with_embeddings,
    )

    provider = [p.provider_id for p in client_with_empty_registry.providers.list() if p.api == "vector_io"][0]
    response = client_with_empty_registry.vector_io.query(
-        vector_db_id=actual_vector_store_id,
+        vector_store_id=actual_vector_store_id,
        query="duplicate",
        params=vector_io_provider_params_dict.get(provider, None),
    )

@@ -23,14 +23,14 @@ class TestRagQuery:
            config=MagicMock(), vector_io_api=MagicMock(), inference_api=MagicMock(), files_api=MagicMock()
        )
        with pytest.raises(ValueError):
-            await rag_tool.query(content=MagicMock(), vector_db_ids=[])
+            await rag_tool.query(content=MagicMock(), vector_store_ids=[])

    async def test_query_chunk_metadata_handling(self):
        rag_tool = MemoryToolRuntimeImpl(
            config=MagicMock(), vector_io_api=MagicMock(), inference_api=MagicMock(), files_api=MagicMock()
        )
        content = "test query content"
-        vector_db_ids = ["db1"]
+        vector_store_ids = ["db1"]
        chunk_metadata = ChunkMetadata(
            document_id="doc1",

@@ -55,7 +55,7 @@ class TestRagQuery:
        query_response = QueryChunksResponse(chunks=[chunk], scores=[1.0])
        rag_tool.vector_io_api.query_chunks = AsyncMock(return_value=query_response)

-        result = await rag_tool.query(content=content, vector_db_ids=vector_db_ids)
+        result = await rag_tool.query(content=content, vector_store_ids=vector_store_ids)

        assert result is not None
        expected_metadata_string = (

@@ -90,7 +90,7 @@ class TestRagQuery:
            files_api=MagicMock(),
        )

-        vector_db_ids = ["db1", "db2"]
+        vector_store_ids = ["db1", "db2"]

        # Fake chunks from each DB
        chunk_metadata1 = ChunkMetadata(

@@ -101,7 +101,7 @@ class TestRagQuery:
        )
        chunk1 = Chunk(
            content="chunk from db1",
-            metadata={"vector_db_id": "db1", "document_id": "doc1"},
+            metadata={"vector_store_id": "db1", "document_id": "doc1"},
            stored_chunk_id="c1",
            chunk_metadata=chunk_metadata1,
        )

@@ -114,7 +114,7 @@ class TestRagQuery:
        )
        chunk2 = Chunk(
            content="chunk from db2",
-            metadata={"vector_db_id": "db2", "document_id": "doc2"},
+            metadata={"vector_store_id": "db2", "document_id": "doc2"},
            stored_chunk_id="c2",
            chunk_metadata=chunk_metadata2,
        )

@@ -126,13 +126,13 @@ class TestRagQuery:
            ]
        )

-        result = await rag_tool.query(content="test", vector_db_ids=vector_db_ids)
+        result = await rag_tool.query(content="test", vector_store_ids=vector_store_ids)

        returned_chunks = result.metadata["chunks"]
        returned_scores = result.metadata["scores"]
        returned_doc_ids = result.metadata["document_ids"]
-        returned_vector_db_ids = result.metadata["vector_db_ids"]
+        returned_vector_store_ids = result.metadata["vector_store_ids"]

        assert returned_chunks == ["chunk from db1", "chunk from db2"]
        assert returned_scores == (0.9, 0.8)
        assert returned_doc_ids == ["doc1", "doc2"]
-        assert returned_vector_db_ids == ["db1", "db2"]
+        assert returned_vector_store_ids == ["db1", "db2"]