diff --git a/client-sdks/stainless/openapi.yml b/client-sdks/stainless/openapi.yml index 7b03cd03e..85c7186af 100644 --- a/client-sdks/stainless/openapi.yml +++ b/client-sdks/stainless/openapi.yml @@ -9862,7 +9862,7 @@ components: $ref: '#/components/schemas/RAGDocument' description: >- List of documents to index in the RAG system - vector_db_id: + vector_store_id: type: string description: >- ID of the vector database to store the document embeddings @@ -9873,7 +9873,7 @@ components: additionalProperties: false required: - documents - - vector_db_id + - vector_store_id - chunk_size_in_tokens title: InsertRequest DefaultRAGQueryGeneratorConfig: @@ -10044,7 +10044,7 @@ components: $ref: '#/components/schemas/InterleavedContent' description: >- The query content to search for in the indexed documents - vector_db_ids: + vector_store_ids: type: array items: type: string @@ -10057,7 +10057,7 @@ components: additionalProperties: false required: - content - - vector_db_ids + - vector_store_ids title: QueryRequest RAGQueryResult: type: object @@ -10281,7 +10281,7 @@ components: InsertChunksRequest: type: object properties: - vector_db_id: + vector_store_id: type: string description: >- The identifier of the vector database to insert the chunks into. @@ -10300,13 +10300,13 @@ components: description: The time to live of the chunks. additionalProperties: false required: - - vector_db_id + - vector_store_id - chunks title: InsertChunksRequest QueryChunksRequest: type: object properties: - vector_db_id: + vector_store_id: type: string description: >- The identifier of the vector database to query. @@ -10326,7 +10326,7 @@ components: description: The parameters of the query. additionalProperties: false required: - - vector_db_id + - vector_store_id - query title: QueryChunksRequest QueryChunksResponse: @@ -11844,7 +11844,7 @@ components: description: Type of the step in an agent turn. const: memory_retrieval default: memory_retrieval - vector_db_ids: + vector_store_ids: type: string description: >- The IDs of the vector databases to retrieve context from. 
@@ -11857,7 +11857,7 @@ components: - turn_id - step_id - step_type - - vector_db_ids + - vector_store_ids - inserted_context title: MemoryRetrievalStep description: >- diff --git a/docs/docs/providers/vector_io/inline_sqlite-vec.mdx b/docs/docs/providers/vector_io/inline_sqlite-vec.mdx index 98a372250..bfa2f29de 100644 --- a/docs/docs/providers/vector_io/inline_sqlite-vec.mdx +++ b/docs/docs/providers/vector_io/inline_sqlite-vec.mdx @@ -72,14 +72,14 @@ description: | Example with hybrid search: ```python response = await vector_io.query_chunks( - vector_db_id="my_db", + vector_store_id="my_db", query="your query here", params={"mode": "hybrid", "max_chunks": 3, "score_threshold": 0.7}, ) # Using RRF ranker response = await vector_io.query_chunks( - vector_db_id="my_db", + vector_store_id="my_db", query="your query here", params={ "mode": "hybrid", @@ -91,7 +91,7 @@ description: | # Using weighted ranker response = await vector_io.query_chunks( - vector_db_id="my_db", + vector_store_id="my_db", query="your query here", params={ "mode": "hybrid", @@ -105,7 +105,7 @@ description: | Example with explicit vector search: ```python response = await vector_io.query_chunks( - vector_db_id="my_db", + vector_store_id="my_db", query="your query here", params={"mode": "vector", "max_chunks": 3, "score_threshold": 0.7}, ) @@ -114,7 +114,7 @@ description: | Example with keyword search: ```python response = await vector_io.query_chunks( - vector_db_id="my_db", + vector_store_id="my_db", query="your query here", params={"mode": "keyword", "max_chunks": 3, "score_threshold": 0.7}, ) @@ -277,14 +277,14 @@ The SQLite-vec provider supports three search modes: Example with hybrid search: ```python response = await vector_io.query_chunks( - vector_db_id="my_db", + vector_store_id="my_db", query="your query here", params={"mode": "hybrid", "max_chunks": 3, "score_threshold": 0.7}, ) # Using RRF ranker response = await vector_io.query_chunks( - vector_db_id="my_db", + vector_store_id="my_db", query="your query here", params={ "mode": "hybrid", @@ -296,7 +296,7 @@ response = await vector_io.query_chunks( # Using weighted ranker response = await vector_io.query_chunks( - vector_db_id="my_db", + vector_store_id="my_db", query="your query here", params={ "mode": "hybrid", @@ -310,7 +310,7 @@ response = await vector_io.query_chunks( Example with explicit vector search: ```python response = await vector_io.query_chunks( - vector_db_id="my_db", + vector_store_id="my_db", query="your query here", params={"mode": "vector", "max_chunks": 3, "score_threshold": 0.7}, ) @@ -319,7 +319,7 @@ response = await vector_io.query_chunks( Example with keyword search: ```python response = await vector_io.query_chunks( - vector_db_id="my_db", + vector_store_id="my_db", query="your query here", params={"mode": "keyword", "max_chunks": 3, "score_threshold": 0.7}, ) diff --git a/docs/static/deprecated-llama-stack-spec.html b/docs/static/deprecated-llama-stack-spec.html index 4ae6add60..e06943cf6 100644 --- a/docs/static/deprecated-llama-stack-spec.html +++ b/docs/static/deprecated-llama-stack-spec.html @@ -4390,7 +4390,7 @@ "const": "memory_retrieval", "default": "memory_retrieval" }, - "vector_db_ids": { + "vector_store_ids": { "type": "string", "description": "The IDs of the vector databases to retrieve context from." 
}, @@ -4404,7 +4404,7 @@ "turn_id", "step_id", "step_type", - "vector_db_ids", + "vector_store_ids", "inserted_context" ], "title": "MemoryRetrievalStep", diff --git a/docs/static/deprecated-llama-stack-spec.yaml b/docs/static/deprecated-llama-stack-spec.yaml index 3bcfde02e..6635b58cf 100644 --- a/docs/static/deprecated-llama-stack-spec.yaml +++ b/docs/static/deprecated-llama-stack-spec.yaml @@ -3252,7 +3252,7 @@ components: description: Type of the step in an agent turn. const: memory_retrieval default: memory_retrieval - vector_db_ids: + vector_store_ids: type: string description: >- The IDs of the vector databases to retrieve context from. @@ -3265,7 +3265,7 @@ components: - turn_id - step_id - step_type - - vector_db_ids + - vector_store_ids - inserted_context title: MemoryRetrievalStep description: >- diff --git a/docs/static/experimental-llama-stack-spec.html b/docs/static/experimental-llama-stack-spec.html index 2ad81d4f2..22473ec11 100644 --- a/docs/static/experimental-llama-stack-spec.html +++ b/docs/static/experimental-llama-stack-spec.html @@ -2865,7 +2865,7 @@ "const": "memory_retrieval", "default": "memory_retrieval" }, - "vector_db_ids": { + "vector_store_ids": { "type": "string", "description": "The IDs of the vector databases to retrieve context from." }, @@ -2879,7 +2879,7 @@ "turn_id", "step_id", "step_type", - "vector_db_ids", + "vector_store_ids", "inserted_context" ], "title": "MemoryRetrievalStep", diff --git a/docs/static/experimental-llama-stack-spec.yaml b/docs/static/experimental-llama-stack-spec.yaml index f15add8cf..0a52bc89b 100644 --- a/docs/static/experimental-llama-stack-spec.yaml +++ b/docs/static/experimental-llama-stack-spec.yaml @@ -2085,7 +2085,7 @@ components: description: Type of the step in an agent turn. const: memory_retrieval default: memory_retrieval - vector_db_ids: + vector_store_ids: type: string description: >- The IDs of the vector databases to retrieve context from. @@ -2098,7 +2098,7 @@ components: - turn_id - step_id - step_type - - vector_db_ids + - vector_store_ids - inserted_context title: MemoryRetrievalStep description: >- diff --git a/docs/static/llama-stack-spec.html b/docs/static/llama-stack-spec.html index 5d8b62db3..d70afb2d3 100644 --- a/docs/static/llama-stack-spec.html +++ b/docs/static/llama-stack-spec.html @@ -11412,7 +11412,7 @@ }, "description": "List of documents to index in the RAG system" }, - "vector_db_id": { + "vector_store_id": { "type": "string", "description": "ID of the vector database to store the document embeddings" }, @@ -11424,7 +11424,7 @@ "additionalProperties": false, "required": [ "documents", - "vector_db_id", + "vector_store_id", "chunk_size_in_tokens" ], "title": "InsertRequest" @@ -11615,7 +11615,7 @@ "$ref": "#/components/schemas/InterleavedContent", "description": "The query content to search for in the indexed documents" }, - "vector_db_ids": { + "vector_store_ids": { "type": "array", "items": { "type": "string" @@ -11630,7 +11630,7 @@ "additionalProperties": false, "required": [ "content", - "vector_db_ids" + "vector_store_ids" ], "title": "QueryRequest" }, @@ -11923,7 +11923,7 @@ "InsertChunksRequest": { "type": "object", "properties": { - "vector_db_id": { + "vector_store_id": { "type": "string", "description": "The identifier of the vector database to insert the chunks into." 
}, @@ -11941,7 +11941,7 @@ }, "additionalProperties": false, "required": [ - "vector_db_id", + "vector_store_id", "chunks" ], "title": "InsertChunksRequest" @@ -11949,7 +11949,7 @@ "QueryChunksRequest": { "type": "object", "properties": { - "vector_db_id": { + "vector_store_id": { "type": "string", "description": "The identifier of the vector database to query." }, @@ -11986,7 +11986,7 @@ }, "additionalProperties": false, "required": [ - "vector_db_id", + "vector_store_id", "query" ], "title": "QueryChunksRequest" diff --git a/docs/static/llama-stack-spec.yaml b/docs/static/llama-stack-spec.yaml index 435520356..78e56df28 100644 --- a/docs/static/llama-stack-spec.yaml +++ b/docs/static/llama-stack-spec.yaml @@ -8649,7 +8649,7 @@ components: $ref: '#/components/schemas/RAGDocument' description: >- List of documents to index in the RAG system - vector_db_id: + vector_store_id: type: string description: >- ID of the vector database to store the document embeddings @@ -8660,7 +8660,7 @@ components: additionalProperties: false required: - documents - - vector_db_id + - vector_store_id - chunk_size_in_tokens title: InsertRequest DefaultRAGQueryGeneratorConfig: @@ -8831,7 +8831,7 @@ components: $ref: '#/components/schemas/InterleavedContent' description: >- The query content to search for in the indexed documents - vector_db_ids: + vector_store_ids: type: array items: type: string @@ -8844,7 +8844,7 @@ components: additionalProperties: false required: - content - - vector_db_ids + - vector_store_ids title: QueryRequest RAGQueryResult: type: object @@ -9068,7 +9068,7 @@ components: InsertChunksRequest: type: object properties: - vector_db_id: + vector_store_id: type: string description: >- The identifier of the vector database to insert the chunks into. @@ -9087,13 +9087,13 @@ components: description: The time to live of the chunks. additionalProperties: false required: - - vector_db_id + - vector_store_id - chunks title: InsertChunksRequest QueryChunksRequest: type: object properties: - vector_db_id: + vector_store_id: type: string description: >- The identifier of the vector database to query. @@ -9113,7 +9113,7 @@ components: description: The parameters of the query. 
additionalProperties: false required: - - vector_db_id + - vector_store_id - query title: QueryChunksRequest QueryChunksResponse: diff --git a/docs/static/stainless-llama-stack-spec.html b/docs/static/stainless-llama-stack-spec.html index 2616a9917..dcd44ec6e 100644 --- a/docs/static/stainless-llama-stack-spec.html +++ b/docs/static/stainless-llama-stack-spec.html @@ -13084,7 +13084,7 @@ }, "description": "List of documents to index in the RAG system" }, - "vector_db_id": { + "vector_store_id": { "type": "string", "description": "ID of the vector database to store the document embeddings" }, @@ -13096,7 +13096,7 @@ "additionalProperties": false, "required": [ "documents", - "vector_db_id", + "vector_store_id", "chunk_size_in_tokens" ], "title": "InsertRequest" @@ -13287,7 +13287,7 @@ "$ref": "#/components/schemas/InterleavedContent", "description": "The query content to search for in the indexed documents" }, - "vector_db_ids": { + "vector_store_ids": { "type": "array", "items": { "type": "string" @@ -13302,7 +13302,7 @@ "additionalProperties": false, "required": [ "content", - "vector_db_ids" + "vector_store_ids" ], "title": "QueryRequest" }, @@ -13595,7 +13595,7 @@ "InsertChunksRequest": { "type": "object", "properties": { - "vector_db_id": { + "vector_store_id": { "type": "string", "description": "The identifier of the vector database to insert the chunks into." }, @@ -13613,7 +13613,7 @@ }, "additionalProperties": false, "required": [ - "vector_db_id", + "vector_store_id", "chunks" ], "title": "InsertChunksRequest" @@ -13621,7 +13621,7 @@ "QueryChunksRequest": { "type": "object", "properties": { - "vector_db_id": { + "vector_store_id": { "type": "string", "description": "The identifier of the vector database to query." }, @@ -13658,7 +13658,7 @@ }, "additionalProperties": false, "required": [ - "vector_db_id", + "vector_store_id", "query" ], "title": "QueryChunksRequest" @@ -15719,7 +15719,7 @@ "const": "memory_retrieval", "default": "memory_retrieval" }, - "vector_db_ids": { + "vector_store_ids": { "type": "string", "description": "The IDs of the vector databases to retrieve context from." 
}, @@ -15733,7 +15733,7 @@ "turn_id", "step_id", "step_type", - "vector_db_ids", + "vector_store_ids", "inserted_context" ], "title": "MemoryRetrievalStep", diff --git a/docs/static/stainless-llama-stack-spec.yaml b/docs/static/stainless-llama-stack-spec.yaml index 7b03cd03e..85c7186af 100644 --- a/docs/static/stainless-llama-stack-spec.yaml +++ b/docs/static/stainless-llama-stack-spec.yaml @@ -9862,7 +9862,7 @@ components: $ref: '#/components/schemas/RAGDocument' description: >- List of documents to index in the RAG system - vector_db_id: + vector_store_id: type: string description: >- ID of the vector database to store the document embeddings @@ -9873,7 +9873,7 @@ components: additionalProperties: false required: - documents - - vector_db_id + - vector_store_id - chunk_size_in_tokens title: InsertRequest DefaultRAGQueryGeneratorConfig: @@ -10044,7 +10044,7 @@ components: $ref: '#/components/schemas/InterleavedContent' description: >- The query content to search for in the indexed documents - vector_db_ids: + vector_store_ids: type: array items: type: string @@ -10057,7 +10057,7 @@ components: additionalProperties: false required: - content - - vector_db_ids + - vector_store_ids title: QueryRequest RAGQueryResult: type: object @@ -10281,7 +10281,7 @@ components: InsertChunksRequest: type: object properties: - vector_db_id: + vector_store_id: type: string description: >- The identifier of the vector database to insert the chunks into. @@ -10300,13 +10300,13 @@ components: description: The time to live of the chunks. additionalProperties: false required: - - vector_db_id + - vector_store_id - chunks title: InsertChunksRequest QueryChunksRequest: type: object properties: - vector_db_id: + vector_store_id: type: string description: >- The identifier of the vector database to query. @@ -10326,7 +10326,7 @@ components: description: The parameters of the query. additionalProperties: false required: - - vector_db_id + - vector_store_id - query title: QueryChunksRequest QueryChunksResponse: @@ -11844,7 +11844,7 @@ components: description: Type of the step in an agent turn. const: memory_retrieval default: memory_retrieval - vector_db_ids: + vector_store_ids: type: string description: >- The IDs of the vector databases to retrieve context from. 
@@ -11857,7 +11857,7 @@ components: - turn_id - step_id - step_type - - vector_db_ids + - vector_store_ids - inserted_context title: MemoryRetrievalStep description: >- diff --git a/scripts/install.sh b/scripts/install.sh index 2417097f4..5e4939767 100755 --- a/scripts/install.sh +++ b/scripts/install.sh @@ -30,8 +30,10 @@ materialize_telemetry_configs() { local otel_cfg="${dest}/otel-collector-config.yaml" local prom_cfg="${dest}/prometheus.yml" local graf_cfg="${dest}/grafana-datasources.yaml" + local graf_dash_cfg="${dest}/grafana-dashboards.yaml" + local dash_json="${dest}/llama-stack-dashboard.json" - for asset in "$otel_cfg" "$prom_cfg" "$graf_cfg"; do + for asset in "$otel_cfg" "$prom_cfg" "$graf_cfg" "$graf_dash_cfg" "$dash_json"; do if [ -e "$asset" ]; then die "Telemetry asset ${asset} already exists; refusing to overwrite" fi @@ -103,6 +105,7 @@ datasources: type: prometheus access: proxy url: http://prometheus:9090 + uid: prometheus isDefault: true editable: true @@ -112,6 +115,224 @@ datasources: url: http://jaeger:16686 editable: true EOF + + cat <<'EOF' > "$graf_dash_cfg" +apiVersion: 1 + +providers: + - name: 'Llama Stack' + orgId: 1 + folder: '' + type: file + disableDeletion: false + updateIntervalSeconds: 10 + allowUiUpdates: true + options: + path: /etc/grafana/provisioning/dashboards +EOF + + # Copy the dashboard JSON inline to avoid line-length issues + cat > "$dash_json" <<'DASHBOARD_JSON' +{ + "annotations": { + "list": [] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": null, + "links": [], + "liveNow": false, + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "custom": { + "drawStyle": "line", + "lineInterpolation": "linear", + "showPoints": "auto", + "fillOpacity": 10 + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [{"color": "green", "value": null}] + } + } + }, + "gridPos": {"h": 8, "w": 12, "x": 0, "y": 0}, + "id": 1, + "options": { + "legend": {"calcs": [], "displayMode": "table", "placement": "bottom", "showLegend": true}, + "tooltip": {"mode": "multi", "sort": "none"} + }, + "targets": [ + { + "datasource": {"type": "prometheus", "uid": "prometheus"}, + "expr": "llama_stack_completion_tokens_total", + "legendFormat": "{{model_id}} ({{provider_id}})", + "refId": "A" + } + ], + "title": "Completion Tokens", + "type": "timeseries" + }, + { + "datasource": {"type": "prometheus", "uid": "prometheus"}, + "fieldConfig": { + "defaults": { + "custom": {"drawStyle": "line", "lineInterpolation": "linear", "showPoints": "auto", "fillOpacity": 10}, + "mappings": [], + "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}]} + } + }, + "gridPos": {"h": 8, "w": 12, "x": 12, "y": 0}, + "id": 2, + "options": { + "legend": {"calcs": [], "displayMode": "table", "placement": "bottom", "showLegend": true}, + "tooltip": {"mode": "multi", "sort": "none"} + }, + "targets": [ + {"datasource": {"type": "prometheus", "uid": "prometheus"}, "expr": "llama_stack_prompt_tokens_total", "legendFormat": "Prompt - {{model_id}}", "refId": "A"}, + {"datasource": {"type": "prometheus", "uid": "prometheus"}, "expr": "llama_stack_tokens_total", "legendFormat": "Total - {{model_id}}", "refId": "B"} + ], + "title": "Prompt & Total Tokens", + "type": "timeseries" + }, + { + "datasource": {"type": "prometheus", "uid": "prometheus"}, + "fieldConfig": { + "defaults": { + "custom": {"drawStyle": "line", "lineInterpolation": 
"linear", "showPoints": "auto", "fillOpacity": 10}, + "mappings": [], + "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}]}, + "unit": "ms" + } + }, + "gridPos": {"h": 8, "w": 12, "x": 0, "y": 8}, + "id": 3, + "options": { + "legend": {"calcs": [], "displayMode": "list", "placement": "bottom", "showLegend": true}, + "tooltip": {"mode": "multi", "sort": "none"} + }, + "targets": [ + {"datasource": {"type": "prometheus", "uid": "$(DS_PROMETHEUS}"}, "expr": "histogram_quantile(0.95, rate(llama_stack_http_server_duration_milliseconds_bucket[5m]))", "legendFormat": "p95", "refId": "A"}, + {"datasource": {"type": "prometheus", "uid": "$(DS_PROMETHEUS}"}, "expr": "histogram_quantile(0.99, rate(llama_stack_http_server_duration_milliseconds_bucket[5m]))", "legendFormat": "p99", "refId": "B"} + ], + "title": "HTTP Request Duration (p95, p99)", + "type": "timeseries" + }, + { + "datasource": {"type": "prometheus", "uid": "$(DS_PROMETHEUS}"}, + "fieldConfig": { + "defaults": { + "mappings": [], + "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}]} + } + }, + "gridPos": {"h": 8, "w": 6, "x": 12, "y": 8}, + "id": 4, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": false}, + "textMode": "auto" + }, + "targets": [ + {"datasource": {"type": "prometheus", "uid": "$(DS_PROMETHEUS}"}, "expr": "sum(llama_stack_http_server_duration_milliseconds_count)", "refId": "A"} + ], + "title": "Total Requests", + "type": "stat" + }, + { + "datasource": {"type": "prometheus", "uid": "$(DS_PROMETHEUS}"}, + "fieldConfig": { + "defaults": { + "mappings": [], + "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}]} + } + }, + "gridPos": {"h": 8, "w": 6, "x": 18, "y": 8}, + "id": 5, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": false}, + "textMode": "auto" + }, + "targets": [ + {"datasource": {"type": "prometheus", "uid": "$(DS_PROMETHEUS}"}, "expr": "sum(llama_stack_http_server_active_requests)", "refId": "A"} + ], + "title": "Active Requests", + "type": "stat" + }, + { + "datasource": {"type": "prometheus", "uid": "$(DS_PROMETHEUS}"}, + "fieldConfig": { + "defaults": { + "custom": {"drawStyle": "line", "lineInterpolation": "linear", "showPoints": "auto", "fillOpacity": 10}, + "mappings": [], + "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}]}, + "unit": "reqps" + } + }, + "gridPos": {"h": 8, "w": 12, "x": 0, "y": 16}, + "id": 6, + "options": { + "legend": {"calcs": [], "displayMode": "list", "placement": "bottom", "showLegend": true}, + "tooltip": {"mode": "multi", "sort": "none"} + }, + "targets": [ + {"datasource": {"type": "prometheus", "uid": "$(DS_PROMETHEUS}"}, "expr": "rate(llama_stack_http_server_duration_milliseconds_count[5m])", "legendFormat": "{{http_target}} - {{http_status_code}}", "refId": "A"} + ], + "title": "Request Rate", + "type": "timeseries" + }, + { + "datasource": {"type": "prometheus", "uid": "$(DS_PROMETHEUS}"}, + "fieldConfig": { + "defaults": { + "custom": {"drawStyle": "line", "lineInterpolation": "linear", "showPoints": "auto", "fillOpacity": 10}, + "mappings": [], + "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}]}, + "unit": "Bps" + } + }, + "gridPos": {"h": 8, "w": 12, "x": 12, "y": 
16}, + "id": 7, + "options": { + "legend": {"calcs": [], "displayMode": "list", "placement": "bottom", "showLegend": true}, + "tooltip": {"mode": "multi", "sort": "none"} + }, + "targets": [ + {"datasource": {"type": "prometheus", "uid": "prometheus"}, "expr": "rate(llama_stack_http_server_request_size_bytes_sum[5m])", "legendFormat": "Request", "refId": "A"}, + {"datasource": {"type": "prometheus", "uid": "prometheus"}, "expr": "rate(llama_stack_http_server_response_size_bytes_sum[5m])", "legendFormat": "Response", "refId": "B"} + ], + "title": "Request/Response Sizes", + "type": "timeseries" + } + ], + "refresh": "5s", + "schemaVersion": 38, + "tags": ["llama-stack"], + "templating": {"list": []}, + "time": {"from": "now-15m", "to": "now"}, + "timepicker": {}, + "timezone": "browser", + "title": "Llama Stack Metrics", + "uid": "llama-stack-metrics", + "version": 0, + "weekStart": "" +} +DASHBOARD_JSON } # Cleanup function to remove temporary files @@ -372,6 +593,8 @@ if [ "$WITH_TELEMETRY" = true ]; then -e GF_SECURITY_ADMIN_PASSWORD=admin \ -e GF_USERS_ALLOW_SIGN_UP=false \ -v "${TELEMETRY_ASSETS_DIR}/grafana-datasources.yaml:/etc/grafana/provisioning/datasources/datasources.yaml:Z" \ + -v "${TELEMETRY_ASSETS_DIR}/grafana-dashboards.yaml:/etc/grafana/provisioning/dashboards/dashboards.yaml:Z" \ + -v "${TELEMETRY_ASSETS_DIR}/llama-stack-dashboard.json:/etc/grafana/provisioning/dashboards/llama-stack-dashboard.json:Z" \ docker.io/grafana/grafana:11.0.0 > /dev/null 2>&1; then die "Grafana startup failed" fi diff --git a/scripts/telemetry/grafana-dashboards.yaml b/scripts/telemetry/grafana-dashboards.yaml new file mode 100644 index 000000000..f063fa518 --- /dev/null +++ b/scripts/telemetry/grafana-dashboards.yaml @@ -0,0 +1,12 @@ +apiVersion: 1 + +providers: + - name: 'Llama Stack' + orgId: 1 + folder: '' + type: file + disableDeletion: false + updateIntervalSeconds: 10 + allowUiUpdates: true + options: + path: /etc/grafana/provisioning/dashboards diff --git a/scripts/telemetry/grafana-datasources.yaml b/scripts/telemetry/grafana-datasources.yaml index d01fe04ce..0634ac687 100644 --- a/scripts/telemetry/grafana-datasources.yaml +++ b/scripts/telemetry/grafana-datasources.yaml @@ -5,6 +5,7 @@ datasources: type: prometheus access: proxy url: http://prometheus:9090 + uid: prometheus isDefault: true editable: true diff --git a/scripts/telemetry/llama-stack-dashboard.json b/scripts/telemetry/llama-stack-dashboard.json new file mode 100644 index 000000000..a9f8ac7a2 --- /dev/null +++ b/scripts/telemetry/llama-stack-dashboard.json @@ -0,0 +1,457 @@ +{ + "annotations": { + "list": [] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": null, + "links": [], + "liveNow": false, + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "custom": { + "drawStyle": "line", + "lineInterpolation": "linear", + "showPoints": "auto", + "fillOpacity": 10 + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + } + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 0 + }, + "id": 1, + "options": { + "legend": { + "calcs": [], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "llama_stack_completion_tokens_total", + "legendFormat": "{{model_id}} 
({{provider_id}})", + "refId": "A" + } + ], + "title": "Completion Tokens", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "custom": { + "drawStyle": "line", + "lineInterpolation": "linear", + "showPoints": "auto", + "fillOpacity": 10 + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + } + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 0 + }, + "id": 2, + "options": { + "legend": { + "calcs": [], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "llama_stack_prompt_tokens_total", + "legendFormat": "Prompt - {{model_id}}", + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "llama_stack_tokens_total", + "legendFormat": "Total - {{model_id}}", + "refId": "B" + } + ], + "title": "Prompt & Total Tokens", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "custom": { + "drawStyle": "line", + "lineInterpolation": "linear", + "showPoints": "auto", + "fillOpacity": 10 + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "ms" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 8 + }, + "id": 3, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "histogram_quantile(0.95, rate(llama_stack_http_server_duration_milliseconds_bucket[5m]))", + "legendFormat": "p95", + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "histogram_quantile(0.99, rate(llama_stack_http_server_duration_milliseconds_bucket[5m]))", + "legendFormat": "p99", + "refId": "B" + } + ], + "title": "HTTP Request Duration (p95, p99)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + } + } + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 12, + "y": 8 + }, + "id": 4, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "sum(llama_stack_http_server_duration_milliseconds_count)", + "refId": "A" + } + ], + "title": "Total Requests", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + } + } + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 18, + "y": 8 + }, + "id": 5, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + 
"lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "sum(llama_stack_http_server_active_requests)", + "refId": "A" + } + ], + "title": "Active Requests", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "custom": { + "drawStyle": "line", + "lineInterpolation": "linear", + "showPoints": "auto", + "fillOpacity": 10 + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "reqps" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 16 + }, + "id": 6, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "rate(llama_stack_http_server_duration_milliseconds_count[5m])", + "legendFormat": "{{http_target}} - {{http_status_code}}", + "refId": "A" + } + ], + "title": "Request Rate", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "custom": { + "drawStyle": "line", + "lineInterpolation": "linear", + "showPoints": "auto", + "fillOpacity": 10 + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "Bps" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 16 + }, + "id": 7, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "rate(llama_stack_http_server_request_size_bytes_sum[5m])", + "legendFormat": "Request", + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "rate(llama_stack_http_server_response_size_bytes_sum[5m])", + "legendFormat": "Response", + "refId": "B" + } + ], + "title": "Request/Response Sizes", + "type": "timeseries" + } + ], + "refresh": "5s", + "schemaVersion": 38, + "tags": [ + "llama-stack" + ], + "templating": { + "list": [] + }, + "time": { + "from": "now-15m", + "to": "now" + }, + "timepicker": {}, + "timezone": "browser", + "title": "Llama Stack Metrics", + "uid": "llama-stack-metrics", + "version": 0, + "weekStart": "" +} diff --git a/scripts/telemetry/setup_telemetry.sh b/scripts/telemetry/setup_telemetry.sh index ab855e8db..cbc052f92 100755 --- a/scripts/telemetry/setup_telemetry.sh +++ b/scripts/telemetry/setup_telemetry.sh @@ -135,6 +135,8 @@ $CONTAINER_RUNTIME run -d --name grafana \ -e GF_SECURITY_ADMIN_PASSWORD=admin \ -e GF_USERS_ALLOW_SIGN_UP=false \ -v "$SCRIPT_DIR/grafana-datasources.yaml:/etc/grafana/provisioning/datasources/datasources.yaml:Z" \ + -v "$SCRIPT_DIR/grafana-dashboards.yaml:/etc/grafana/provisioning/dashboards/dashboards.yaml:Z" \ + -v "$SCRIPT_DIR/llama-stack-dashboard.json:/etc/grafana/provisioning/dashboards/llama-stack-dashboard.json:Z" \ docker.io/grafana/grafana:11.0.0 # Wait for services to start diff --git a/src/llama_stack/apis/agents/agents.py b/src/llama_stack/apis/agents/agents.py index 6ad45cf99..9c3e9231b 100644 --- a/src/llama_stack/apis/agents/agents.py +++ 
b/src/llama_stack/apis/agents/agents.py @@ -149,13 +149,13 @@ class ShieldCallStep(StepCommon): class MemoryRetrievalStep(StepCommon): """A memory retrieval step in an agent turn. - :param vector_db_ids: The IDs of the vector databases to retrieve context from. + :param vector_store_ids: The IDs of the vector databases to retrieve context from. :param inserted_context: The context retrieved from the vector databases. """ step_type: Literal[StepType.memory_retrieval] = StepType.memory_retrieval # TODO: should this be List[str]? - vector_db_ids: str + vector_store_ids: str inserted_context: InterleavedContent diff --git a/src/llama_stack/apis/inference/inference.py b/src/llama_stack/apis/inference/inference.py index 8dc4dcf07..519fa0eb1 100644 --- a/src/llama_stack/apis/inference/inference.py +++ b/src/llama_stack/apis/inference/inference.py @@ -21,8 +21,8 @@ from typing_extensions import TypedDict from llama_stack.apis.common.content_types import ContentDelta, InterleavedContent from llama_stack.apis.common.responses import Order from llama_stack.apis.models import Model -from llama_stack.apis.telemetry import MetricResponseMixin from llama_stack.apis.version import LLAMA_STACK_API_V1, LLAMA_STACK_API_V1ALPHA +from llama_stack.core.telemetry.telemetry import MetricResponseMixin from llama_stack.core.telemetry.trace_protocol import trace_protocol from llama_stack.models.llama.datatypes import ( BuiltinTool, diff --git a/src/llama_stack/apis/tools/rag_tool.py b/src/llama_stack/apis/tools/rag_tool.py index c508721f1..4e43bb284 100644 --- a/src/llama_stack/apis/tools/rag_tool.py +++ b/src/llama_stack/apis/tools/rag_tool.py @@ -190,13 +190,13 @@ class RAGToolRuntime(Protocol): async def insert( self, documents: list[RAGDocument], - vector_db_id: str, + vector_store_id: str, chunk_size_in_tokens: int = 512, ) -> None: """Index documents so they can be used by the RAG system. :param documents: List of documents to index in the RAG system - :param vector_db_id: ID of the vector database to store the document embeddings + :param vector_store_id: ID of the vector database to store the document embeddings :param chunk_size_in_tokens: (Optional) Size in tokens for document chunking during indexing """ ... @@ -205,13 +205,13 @@ class RAGToolRuntime(Protocol): async def query( self, content: InterleavedContent, - vector_db_ids: list[str], + vector_store_ids: list[str], query_config: RAGQueryConfig | None = None, ) -> RAGQueryResult: """Query the RAG system for context; typically invoked by the agent. 
:param content: The query content to search for in the indexed documents - :param vector_db_ids: List of vector database IDs to search within + :param vector_store_ids: List of vector database IDs to search within :param query_config: (Optional) Configuration parameters for the query operation :returns: RAGQueryResult containing the retrieved content and metadata """ diff --git a/src/llama_stack/apis/vector_io/vector_io.py b/src/llama_stack/apis/vector_io/vector_io.py index 6e855ab99..19703e7bb 100644 --- a/src/llama_stack/apis/vector_io/vector_io.py +++ b/src/llama_stack/apis/vector_io/vector_io.py @@ -529,17 +529,17 @@ class VectorIO(Protocol): # this will just block now until chunks are inserted, but it should # probably return a Job instance which can be polled for completion - # TODO: rename vector_db_id to vector_store_id once Stainless is working + # NOTE: this parameter was renamed from vector_db_id to vector_store_id @webmethod(route="/vector-io/insert", method="POST", level=LLAMA_STACK_API_V1) async def insert_chunks( self, - vector_db_id: str, + vector_store_id: str, chunks: list[Chunk], ttl_seconds: int | None = None, ) -> None: """Insert chunks into a vector database. - :param vector_db_id: The identifier of the vector database to insert the chunks into. + :param vector_store_id: The identifier of the vector database to insert the chunks into. :param chunks: The chunks to insert. Each `Chunk` should contain content which can be interleaved text, images, or other types. `metadata`: `dict[str, Any]` and `embedding`: `List[float]` are optional. If `metadata` is provided, you configure how Llama Stack formats the chunk during generation. @@ -548,17 +548,17 @@ class VectorIO(Protocol): """ ... - # TODO: rename vector_db_id to vector_store_id once Stainless is working + # NOTE: this parameter was renamed from vector_db_id to vector_store_id @webmethod(route="/vector-io/query", method="POST", level=LLAMA_STACK_API_V1) async def query_chunks( self, - vector_db_id: str, + vector_store_id: str, query: InterleavedContent, params: dict[str, Any] | None = None, ) -> QueryChunksResponse: """Query chunks from a vector database. - :param vector_db_id: The identifier of the vector database to query. + :param vector_store_id: The identifier of the vector database to query. :param query: The query to search for. :param params: The parameters of the query. :returns: A QueryChunksResponse. 
diff --git a/src/llama_stack/core/conversations/conversations.py b/src/llama_stack/core/conversations/conversations.py index 83a49e848..951de5e9d 100644 --- a/src/llama_stack/core/conversations/conversations.py +++ b/src/llama_stack/core/conversations/conversations.py @@ -312,3 +312,6 @@ class ConversationServiceImpl(Conversations): logger.debug(f"Deleted item {item_id} from conversation {conversation_id}") return ConversationItemDeletedResource(id=item_id) + + async def shutdown(self) -> None: + pass diff --git a/src/llama_stack/core/prompts/prompts.py b/src/llama_stack/core/prompts/prompts.py index 1e48bcc8c..1a6f38cb5 100644 --- a/src/llama_stack/core/prompts/prompts.py +++ b/src/llama_stack/core/prompts/prompts.py @@ -230,3 +230,6 @@ class PromptServiceImpl(Prompts): await self.kvstore.set(default_key, str(version)) return self._deserialize_prompt(data) + + async def shutdown(self) -> None: + pass diff --git a/src/llama_stack/core/routers/inference.py b/src/llama_stack/core/routers/inference.py index d532bc622..dfd5e8e54 100644 --- a/src/llama_stack/core/routers/inference.py +++ b/src/llama_stack/core/routers/inference.py @@ -53,7 +53,7 @@ from llama_stack.apis.inference.inference import ( OpenAIChatCompletionContentPartTextParam, ) from llama_stack.apis.models import Model, ModelType -from llama_stack.apis.telemetry import MetricEvent, MetricInResponse +from llama_stack.core.telemetry.telemetry import MetricEvent, MetricInResponse from llama_stack.core.telemetry.tracing import enqueue_event, get_current_span from llama_stack.log import get_logger from llama_stack.models.llama.llama3.chat_format import ChatFormat diff --git a/src/llama_stack/core/routers/vector_io.py b/src/llama_stack/core/routers/vector_io.py index 2b1701dc2..78b38ba95 100644 --- a/src/llama_stack/core/routers/vector_io.py +++ b/src/llama_stack/core/routers/vector_io.py @@ -73,27 +73,27 @@ class VectorIORouter(VectorIO): async def insert_chunks( self, - vector_db_id: str, + vector_store_id: str, chunks: list[Chunk], ttl_seconds: int | None = None, ) -> None: doc_ids = [chunk.document_id for chunk in chunks[:3]] logger.debug( - f"VectorIORouter.insert_chunks: {vector_db_id}, {len(chunks)} chunks, " + f"VectorIORouter.insert_chunks: {vector_store_id}, {len(chunks)} chunks, " f"ttl_seconds={ttl_seconds}, chunk_ids={doc_ids}{' and more...' 
if len(chunks) > 3 else ''}" ) - provider = await self.routing_table.get_provider_impl(vector_db_id) - return await provider.insert_chunks(vector_db_id, chunks, ttl_seconds) + provider = await self.routing_table.get_provider_impl(vector_store_id) + return await provider.insert_chunks(vector_store_id, chunks, ttl_seconds) async def query_chunks( self, - vector_db_id: str, + vector_store_id: str, query: InterleavedContent, params: dict[str, Any] | None = None, ) -> QueryChunksResponse: - logger.debug(f"VectorIORouter.query_chunks: {vector_db_id}") - provider = await self.routing_table.get_provider_impl(vector_db_id) - return await provider.query_chunks(vector_db_id, query, params) + logger.debug(f"VectorIORouter.query_chunks: {vector_store_id}") + provider = await self.routing_table.get_provider_impl(vector_store_id) + return await provider.query_chunks(vector_store_id, query, params) # OpenAI Vector Stores API endpoints async def openai_create_vector_store( diff --git a/src/llama_stack/core/stack.py b/src/llama_stack/core/stack.py index 1b5c288a1..eccc562ae 100644 --- a/src/llama_stack/core/stack.py +++ b/src/llama_stack/core/stack.py @@ -31,7 +31,6 @@ from llama_stack.apis.scoring import Scoring from llama_stack.apis.scoring_functions import ScoringFunctions from llama_stack.apis.shields import Shields from llama_stack.apis.synthetic_data_generation import SyntheticDataGeneration -from llama_stack.apis.telemetry import Telemetry from llama_stack.apis.tools import RAGToolRuntime, ToolGroups, ToolRuntime from llama_stack.apis.vector_io import VectorIO from llama_stack.core.conversations.conversations import ConversationServiceConfig, ConversationServiceImpl @@ -67,7 +66,6 @@ class LlamaStack( Safety, SyntheticDataGeneration, Datasets, - Telemetry, PostTraining, VectorIO, Eval, diff --git a/src/llama_stack/core/telemetry/telemetry.py b/src/llama_stack/core/telemetry/telemetry.py index f0cec08ec..dbd10e89c 100644 --- a/src/llama_stack/core/telemetry/telemetry.py +++ b/src/llama_stack/core/telemetry/telemetry.py @@ -6,7 +6,13 @@ import os import threading -from typing import Any +from datetime import datetime +from enum import Enum +from typing import ( + Annotated, + Any, + Literal, +) from opentelemetry import metrics, trace from opentelemetry.exporter.otlp.proto.http.metric_exporter import OTLPMetricExporter @@ -16,21 +22,399 @@ from opentelemetry.sdk.metrics.export import PeriodicExportingMetricReader from opentelemetry.sdk.trace import TracerProvider from opentelemetry.sdk.trace.export import BatchSpanProcessor from opentelemetry.trace.propagation.tracecontext import TraceContextTextMapPropagator +from pydantic import BaseModel, Field -from llama_stack.apis.telemetry import ( - Event, - MetricEvent, - SpanEndPayload, - SpanStartPayload, - SpanStatus, - StructuredLogEvent, - UnstructuredLogEvent, -) -from llama_stack.apis.telemetry import ( - Telemetry as TelemetryBase, -) -from llama_stack.core.telemetry.tracing import ROOT_SPAN_MARKERS from llama_stack.log import get_logger +from llama_stack.models.llama.datatypes import Primitive +from llama_stack.schema_utils import json_schema_type, register_schema + +ROOT_SPAN_MARKERS = ["__root__", "__root_span__"] + + +@json_schema_type +class SpanStatus(Enum): + """The status of a span indicating whether it completed successfully or with an error. 
+ :cvar OK: Span completed successfully without errors + :cvar ERROR: Span completed with an error or failure + """ + + OK = "ok" + ERROR = "error" + + +@json_schema_type +class Span(BaseModel): + """A span representing a single operation within a trace. + :param span_id: Unique identifier for the span + :param trace_id: Unique identifier for the trace this span belongs to + :param parent_span_id: (Optional) Unique identifier for the parent span, if this is a child span + :param name: Human-readable name describing the operation this span represents + :param start_time: Timestamp when the operation began + :param end_time: (Optional) Timestamp when the operation finished, if completed + :param attributes: (Optional) Key-value pairs containing additional metadata about the span + """ + + span_id: str + trace_id: str + parent_span_id: str | None = None + name: str + start_time: datetime + end_time: datetime | None = None + attributes: dict[str, Any] | None = Field(default_factory=lambda: {}) + + def set_attribute(self, key: str, value: Any): + if self.attributes is None: + self.attributes = {} + self.attributes[key] = value + + +@json_schema_type +class Trace(BaseModel): + """A trace representing the complete execution path of a request across multiple operations. + :param trace_id: Unique identifier for the trace + :param root_span_id: Unique identifier for the root span that started this trace + :param start_time: Timestamp when the trace began + :param end_time: (Optional) Timestamp when the trace finished, if completed + """ + + trace_id: str + root_span_id: str + start_time: datetime + end_time: datetime | None = None + + +@json_schema_type +class EventType(Enum): + """The type of telemetry event being logged. + :cvar UNSTRUCTURED_LOG: A simple log message with severity level + :cvar STRUCTURED_LOG: A structured log event with typed payload data + :cvar METRIC: A metric measurement with value and unit + """ + + UNSTRUCTURED_LOG = "unstructured_log" + STRUCTURED_LOG = "structured_log" + METRIC = "metric" + + +@json_schema_type +class LogSeverity(Enum): + """The severity level of a log message. + :cvar VERBOSE: Detailed diagnostic information for troubleshooting + :cvar DEBUG: Debug information useful during development + :cvar INFO: General informational messages about normal operation + :cvar WARN: Warning messages about potentially problematic situations + :cvar ERROR: Error messages indicating failures that don't stop execution + :cvar CRITICAL: Critical error messages indicating severe failures + """ + + VERBOSE = "verbose" + DEBUG = "debug" + INFO = "info" + WARN = "warn" + ERROR = "error" + CRITICAL = "critical" + + +class EventCommon(BaseModel): + """Common fields shared by all telemetry events. + :param trace_id: Unique identifier for the trace this event belongs to + :param span_id: Unique identifier for the span this event belongs to + :param timestamp: Timestamp when the event occurred + :param attributes: (Optional) Key-value pairs containing additional metadata about the event + """ + + trace_id: str + span_id: str + timestamp: datetime + attributes: dict[str, Primitive] | None = Field(default_factory=lambda: {}) + + +@json_schema_type +class UnstructuredLogEvent(EventCommon): + """An unstructured log event containing a simple text message. 
+ :param type: Event type identifier set to UNSTRUCTURED_LOG + :param message: The log message text + :param severity: The severity level of the log message + """ + + type: Literal[EventType.UNSTRUCTURED_LOG] = EventType.UNSTRUCTURED_LOG + message: str + severity: LogSeverity + + +@json_schema_type +class MetricEvent(EventCommon): + """A metric event containing a measured value. + :param type: Event type identifier set to METRIC + :param metric: The name of the metric being measured + :param value: The numeric value of the metric measurement + :param unit: The unit of measurement for the metric value + """ + + type: Literal[EventType.METRIC] = EventType.METRIC + metric: str # this would be an enum + value: int | float + unit: str + + +@json_schema_type +class MetricInResponse(BaseModel): + """A metric value included in API responses. + :param metric: The name of the metric + :param value: The numeric value of the metric + :param unit: (Optional) The unit of measurement for the metric value + """ + + metric: str + value: int | float + unit: str | None = None + + +# This is a short term solution to allow inference API to return metrics +# The ideal way to do this is to have a way for all response types to include metrics +# and all metric events logged to the telemetry API to be included with the response +# To do this, we will need to augment all response types with a metrics field. +# We have hit a blocker from stainless SDK that prevents us from doing this. +# The blocker is that if we were to augment the response types that have a data field +# in them like so +# class ListModelsResponse(BaseModel): +# metrics: Optional[List[MetricEvent]] = None +# data: List[Models] +# ... +# The client SDK will need to access the data by using a .data field, which is not +# ergonomic. Stainless SDK does support unwrapping the response type, but it +# requires that the response type to only have a single field. + +# We will need a way in the client SDK to signal that the metrics are needed +# and if they are needed, the client SDK has to return the full response type +# without unwrapping it. + + +class MetricResponseMixin(BaseModel): + """Mixin class for API responses that can include metrics. + :param metrics: (Optional) List of metrics associated with the API response + """ + + metrics: list[MetricInResponse] | None = None + + +@json_schema_type +class StructuredLogType(Enum): + """The type of structured log event payload. + :cvar SPAN_START: Event indicating the start of a new span + :cvar SPAN_END: Event indicating the completion of a span + """ + + SPAN_START = "span_start" + SPAN_END = "span_end" + + +@json_schema_type +class SpanStartPayload(BaseModel): + """Payload for a span start event. + :param type: Payload type identifier set to SPAN_START + :param name: Human-readable name describing the operation this span represents + :param parent_span_id: (Optional) Unique identifier for the parent span, if this is a child span + """ + + type: Literal[StructuredLogType.SPAN_START] = StructuredLogType.SPAN_START + name: str + parent_span_id: str | None = None + + +@json_schema_type +class SpanEndPayload(BaseModel): + """Payload for a span end event. 
+ :param type: Payload type identifier set to SPAN_END + :param status: The final status of the span indicating success or failure + """ + + type: Literal[StructuredLogType.SPAN_END] = StructuredLogType.SPAN_END + status: SpanStatus + + +StructuredLogPayload = Annotated[ + SpanStartPayload | SpanEndPayload, + Field(discriminator="type"), +] +register_schema(StructuredLogPayload, name="StructuredLogPayload") + + +@json_schema_type +class StructuredLogEvent(EventCommon): + """A structured log event containing typed payload data. + :param type: Event type identifier set to STRUCTURED_LOG + :param payload: The structured payload data for the log event + """ + + type: Literal[EventType.STRUCTURED_LOG] = EventType.STRUCTURED_LOG + payload: StructuredLogPayload + + +Event = Annotated[ + UnstructuredLogEvent | MetricEvent | StructuredLogEvent, + Field(discriminator="type"), +] +register_schema(Event, name="Event") + + +@json_schema_type +class EvalTrace(BaseModel): + """A trace record for evaluation purposes. + :param session_id: Unique identifier for the evaluation session + :param step: The evaluation step or phase identifier + :param input: The input data for the evaluation + :param output: The actual output produced during evaluation + :param expected_output: The expected output for comparison during evaluation + """ + + session_id: str + step: str + input: str + output: str + expected_output: str + + +@json_schema_type +class SpanWithStatus(Span): + """A span that includes status information. + :param status: (Optional) The current status of the span + """ + + status: SpanStatus | None = None + + +@json_schema_type +class QueryConditionOp(Enum): + """Comparison operators for query conditions. + :cvar EQ: Equal to comparison + :cvar NE: Not equal to comparison + :cvar GT: Greater than comparison + :cvar LT: Less than comparison + """ + + EQ = "eq" + NE = "ne" + GT = "gt" + LT = "lt" + + +@json_schema_type +class QueryCondition(BaseModel): + """A condition for filtering query results. + :param key: The attribute key to filter on + :param op: The comparison operator to apply + :param value: The value to compare against + """ + + key: str + op: QueryConditionOp + value: Any + + +class QueryTracesResponse(BaseModel): + """Response containing a list of traces. + :param data: List of traces matching the query criteria + """ + + data: list[Trace] + + +class QuerySpansResponse(BaseModel): + """Response containing a list of spans. + :param data: List of spans matching the query criteria + """ + + data: list[Span] + + +class QuerySpanTreeResponse(BaseModel): + """Response containing a tree structure of spans. + :param data: Dictionary mapping span IDs to spans with status information + """ + + data: dict[str, SpanWithStatus] + + +class MetricQueryType(Enum): + """The type of metric query to perform. + :cvar RANGE: Query metrics over a time range + :cvar INSTANT: Query metrics at a specific point in time + """ + + RANGE = "range" + INSTANT = "instant" + + +class MetricLabelOperator(Enum): + """Operators for matching metric labels. + :cvar EQUALS: Label value must equal the specified value + :cvar NOT_EQUALS: Label value must not equal the specified value + :cvar REGEX_MATCH: Label value must match the specified regular expression + :cvar REGEX_NOT_MATCH: Label value must not match the specified regular expression + """ + + EQUALS = "=" + NOT_EQUALS = "!=" + REGEX_MATCH = "=~" + REGEX_NOT_MATCH = "!~" + + +class MetricLabelMatcher(BaseModel): + """A matcher for filtering metrics by label values. 
+ :param name: The name of the label to match + :param value: The value to match against + :param operator: The comparison operator to use for matching + """ + + name: str + value: str + operator: MetricLabelOperator = MetricLabelOperator.EQUALS + + +@json_schema_type +class MetricLabel(BaseModel): + """A label associated with a metric. + :param name: The name of the label + :param value: The value of the label + """ + + name: str + value: str + + +@json_schema_type +class MetricDataPoint(BaseModel): + """A single data point in a metric time series. + :param timestamp: Unix timestamp when the metric value was recorded + :param value: The numeric value of the metric at this timestamp + """ + + timestamp: int + value: float + unit: str + + +@json_schema_type +class MetricSeries(BaseModel): + """A time series of metric data points. + :param metric: The name of the metric + :param labels: List of labels associated with this metric series + :param values: List of data points in chronological order + """ + + metric: str + labels: list[MetricLabel] + values: list[MetricDataPoint] + + +class QueryMetricsResponse(BaseModel): + """Response containing metric time series data. + :param data: List of metric series matching the query criteria + """ + + data: list[MetricSeries] + _GLOBAL_STORAGE: dict[str, dict[str | int, Any]] = { "active_spans": {}, @@ -49,7 +433,7 @@ def is_tracing_enabled(tracer): return span.is_recording() -class Telemetry(TelemetryBase): +class Telemetry: def __init__(self) -> None: self.meter = None diff --git a/src/llama_stack/core/telemetry/tracing.py b/src/llama_stack/core/telemetry/tracing.py index 7742ea0f4..a67cbe784 100644 --- a/src/llama_stack/core/telemetry/tracing.py +++ b/src/llama_stack/core/telemetry/tracing.py @@ -17,7 +17,8 @@ from datetime import UTC, datetime from functools import wraps from typing import Any, Self -from llama_stack.apis.telemetry import ( +from llama_stack.core.telemetry.telemetry import ( + ROOT_SPAN_MARKERS, Event, LogSeverity, Span, @@ -47,7 +48,6 @@ if not _fallback_logger.handlers: INVALID_SPAN_ID = 0x0000000000000000 INVALID_TRACE_ID = 0x00000000000000000000000000000000 -ROOT_SPAN_MARKERS = ["__root__", "__root_span__"] # The logical root span may not be visible to this process if a parent context # is passed in. The local root span is the first local span in a trace. 
LOCAL_ROOT_SPAN_MARKER = "__local_root_span__" diff --git a/src/llama_stack/providers/inline/agents/meta_reference/agent_instance.py b/src/llama_stack/providers/inline/agents/meta_reference/agent_instance.py index 9fd3f7d0e..80ef068c7 100644 --- a/src/llama_stack/providers/inline/agents/meta_reference/agent_instance.py +++ b/src/llama_stack/providers/inline/agents/meta_reference/agent_instance.py @@ -488,13 +488,13 @@ class ChatAgent(ShieldRunnerMixin): session_info = await self.storage.get_session_info(session_id) # if the session has a memory bank id, let the memory tool use it - if session_info and session_info.vector_db_id: + if session_info and session_info.vector_store_id: for tool_name in self.tool_name_to_args.keys(): if tool_name == MEMORY_QUERY_TOOL: - if "vector_db_ids" not in self.tool_name_to_args[tool_name]: - self.tool_name_to_args[tool_name]["vector_db_ids"] = [session_info.vector_db_id] + if "vector_store_ids" not in self.tool_name_to_args[tool_name]: + self.tool_name_to_args[tool_name]["vector_store_ids"] = [session_info.vector_store_id] else: - self.tool_name_to_args[tool_name]["vector_db_ids"].append(session_info.vector_db_id) + self.tool_name_to_args[tool_name]["vector_store_ids"].append(session_info.vector_store_id) output_attachments = [] diff --git a/src/llama_stack/providers/inline/agents/meta_reference/persistence.py b/src/llama_stack/providers/inline/agents/meta_reference/persistence.py index 3b7b4729c..26a2151e3 100644 --- a/src/llama_stack/providers/inline/agents/meta_reference/persistence.py +++ b/src/llama_stack/providers/inline/agents/meta_reference/persistence.py @@ -22,7 +22,7 @@ log = get_logger(name=__name__, category="agents::meta_reference") class AgentSessionInfo(Session): # TODO: is this used anywhere? - vector_db_id: str | None = None + vector_store_id: str | None = None started_at: datetime owner: User | None = None identifier: str | None = None @@ -93,12 +93,12 @@ class AgentPersistence: return session_info - async def add_vector_db_to_session(self, session_id: str, vector_db_id: str): + async def add_vector_db_to_session(self, session_id: str, vector_store_id: str): session_info = await self.get_session_if_accessible(session_id) if session_info is None: raise SessionNotFoundError(session_id) - session_info.vector_db_id = vector_db_id + session_info.vector_store_id = vector_store_id await self.kvstore.set( key=f"session:{self.agent_id}:{session_id}", value=session_info.model_dump_json(), diff --git a/src/llama_stack/providers/inline/tool_runtime/rag/memory.py b/src/llama_stack/providers/inline/tool_runtime/rag/memory.py index dc3dfbbca..3ee745bf1 100644 --- a/src/llama_stack/providers/inline/tool_runtime/rag/memory.py +++ b/src/llama_stack/providers/inline/tool_runtime/rag/memory.py @@ -119,7 +119,7 @@ class MemoryToolRuntimeImpl(ToolGroupsProtocolPrivate, ToolRuntime, RAGToolRunti async def insert( self, documents: list[RAGDocument], - vector_db_id: str, + vector_store_id: str, chunk_size_in_tokens: int = 512, ) -> None: if not documents: @@ -158,14 +158,14 @@ class MemoryToolRuntimeImpl(ToolGroupsProtocolPrivate, ToolRuntime, RAGToolRunti try: await self.vector_io_api.openai_attach_file_to_vector_store( - vector_store_id=vector_db_id, + vector_store_id=vector_store_id, file_id=created_file.id, attributes=doc.metadata, chunking_strategy=chunking_strategy, ) except Exception as e: log.error( - f"Failed to attach file {created_file.id} to vector store {vector_db_id} for document {doc.document_id}: {e}" + f"Failed to attach file {created_file.id} to 
diff --git a/src/llama_stack/providers/inline/tool_runtime/rag/memory.py b/src/llama_stack/providers/inline/tool_runtime/rag/memory.py
index dc3dfbbca..3ee745bf1 100644
--- a/src/llama_stack/providers/inline/tool_runtime/rag/memory.py
+++ b/src/llama_stack/providers/inline/tool_runtime/rag/memory.py
@@ -119,7 +119,7 @@ class MemoryToolRuntimeImpl(ToolGroupsProtocolPrivate, ToolRuntime, RAGToolRunti
     async def insert(
         self,
         documents: list[RAGDocument],
-        vector_db_id: str,
+        vector_store_id: str,
         chunk_size_in_tokens: int = 512,
     ) -> None:
         if not documents:
@@ -158,14 +158,14 @@ class MemoryToolRuntimeImpl(ToolGroupsProtocolPrivate, ToolRuntime, RAGToolRunti
             try:
                 await self.vector_io_api.openai_attach_file_to_vector_store(
-                    vector_store_id=vector_db_id,
+                    vector_store_id=vector_store_id,
                     file_id=created_file.id,
                     attributes=doc.metadata,
                     chunking_strategy=chunking_strategy,
                 )
             except Exception as e:
                 log.error(
-                    f"Failed to attach file {created_file.id} to vector store {vector_db_id} for document {doc.document_id}: {e}"
+                    f"Failed to attach file {created_file.id} to vector store {vector_store_id} for document {doc.document_id}: {e}"
                 )
                 continue
@@ -176,10 +176,10 @@ class MemoryToolRuntimeImpl(ToolGroupsProtocolPrivate, ToolRuntime, RAGToolRunti
     async def query(
         self,
         content: InterleavedContent,
-        vector_db_ids: list[str],
+        vector_store_ids: list[str],
         query_config: RAGQueryConfig | None = None,
     ) -> RAGQueryResult:
-        if not vector_db_ids:
+        if not vector_store_ids:
             raise ValueError(
                 "No vector DBs were provided to the knowledge search tool. Please provide at least one vector DB ID."
             )
@@ -192,7 +192,7 @@ class MemoryToolRuntimeImpl(ToolGroupsProtocolPrivate, ToolRuntime, RAGToolRunti
             )
         tasks = [
             self.vector_io_api.query_chunks(
-                vector_db_id=vector_db_id,
+                vector_store_id=vector_store_id,
                 query=query,
                 params={
                     "mode": query_config.mode,
@@ -201,18 +201,18 @@ class MemoryToolRuntimeImpl(ToolGroupsProtocolPrivate, ToolRuntime, RAGToolRunti
                     "ranker": query_config.ranker,
                 },
             )
-            for vector_db_id in vector_db_ids
+            for vector_store_id in vector_store_ids
         ]
         results: list[QueryChunksResponse] = await asyncio.gather(*tasks)

         chunks = []
         scores = []

-        for vector_db_id, result in zip(vector_db_ids, results, strict=False):
+        for vector_store_id, result in zip(vector_store_ids, results, strict=False):
             for chunk, score in zip(result.chunks, result.scores, strict=False):
                 if not hasattr(chunk, "metadata") or chunk.metadata is None:
                     chunk.metadata = {}
-                chunk.metadata["vector_db_id"] = vector_db_id
+                chunk.metadata["vector_store_id"] = vector_store_id

                 chunks.append(chunk)
                 scores.append(score)
@@ -250,7 +250,7 @@ class MemoryToolRuntimeImpl(ToolGroupsProtocolPrivate, ToolRuntime, RAGToolRunti
         metadata_keys_to_exclude_from_context = [
             "token_count",
             "metadata_token_count",
-            "vector_db_id",
+            "vector_store_id",
         ]
         metadata_for_context = {}
         for k in chunk_metadata_keys_to_include_from_context:
@@ -275,7 +275,7 @@ class MemoryToolRuntimeImpl(ToolGroupsProtocolPrivate, ToolRuntime, RAGToolRunti
                 "document_ids": [c.document_id for c in chunks[: len(picked)]],
                 "chunks": [c.content for c in chunks[: len(picked)]],
                 "scores": scores[: len(picked)],
-                "vector_db_ids": [c.metadata["vector_db_id"] for c in chunks[: len(picked)]],
+                "vector_store_ids": [c.metadata["vector_store_id"] for c in chunks[: len(picked)]],
             },
         )
@@ -309,7 +309,7 @@ class MemoryToolRuntimeImpl(ToolGroupsProtocolPrivate, ToolRuntime, RAGToolRunti
     )

     async def invoke_tool(self, tool_name: str, kwargs: dict[str, Any]) -> ToolInvocationResult:
-        vector_db_ids = kwargs.get("vector_db_ids", [])
+        vector_store_ids = kwargs.get("vector_store_ids", [])
         query_config = kwargs.get("query_config")
         if query_config:
             query_config = TypeAdapter(RAGQueryConfig).validate_python(query_config)
@@ -319,7 +319,7 @@ class MemoryToolRuntimeImpl(ToolGroupsProtocolPrivate, ToolRuntime, RAGToolRunti
         query = kwargs["query"]
         result = await self.query(
             content=query,
-            vector_db_ids=vector_db_ids,
+            vector_store_ids=vector_store_ids,
             query_config=query_config,
        )
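The RAG tool runtime's public `insert` and `query` signatures change here, so a hedged usage sketch may help. `rag_tool` stands for an instance of `MemoryToolRuntimeImpl` (or an equivalent client-side wrapper); the import path, document fields, and IDs are assumptions, not part of this diff.

```python
# Sketch only: keyword names follow the new signatures above.
from llama_stack.apis.tools import RAGDocument  # import path assumed

doc = RAGDocument(
    document_id="doc-1",  # placeholder
    content="Paris is the capital of France.",
    mime_type="text/plain",
    metadata={},
)

# was: insert(documents=..., vector_db_id=..., chunk_size_in_tokens=...)
await rag_tool.insert(
    documents=[doc],
    vector_store_id="my-store",
    chunk_size_in_tokens=512,
)

# was: query(content=..., vector_db_ids=[...])
result = await rag_tool.query(
    content="What is the capital of France?",
    vector_store_ids=["my-store"],
)
# result.metadata now carries "vector_store_ids" instead of "vector_db_ids"
print(result.metadata["vector_store_ids"])
```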
diff --git a/src/llama_stack/providers/inline/vector_io/faiss/faiss.py b/src/llama_stack/providers/inline/vector_io/faiss/faiss.py
index 5e33d4ca3..9d8e282b0 100644
--- a/src/llama_stack/providers/inline/vector_io/faiss/faiss.py
+++ b/src/llama_stack/providers/inline/vector_io/faiss/faiss.py
@@ -248,19 +248,19 @@ class FaissVectorIOAdapter(OpenAIVectorStoreMixin, VectorIO, VectorStoresProtoco
         del self.cache[vector_store_id]
         await self.kvstore.delete(f"{VECTOR_DBS_PREFIX}{vector_store_id}")

-    async def insert_chunks(self, vector_db_id: str, chunks: list[Chunk], ttl_seconds: int | None = None) -> None:
-        index = self.cache.get(vector_db_id)
+    async def insert_chunks(self, vector_store_id: str, chunks: list[Chunk], ttl_seconds: int | None = None) -> None:
+        index = self.cache.get(vector_store_id)
         if index is None:
-            raise ValueError(f"Vector DB {vector_db_id} not found. found: {self.cache.keys()}")
+            raise ValueError(f"Vector DB {vector_store_id} not found. found: {self.cache.keys()}")

         await index.insert_chunks(chunks)

     async def query_chunks(
-        self, vector_db_id: str, query: InterleavedContent, params: dict[str, Any] | None = None
+        self, vector_store_id: str, query: InterleavedContent, params: dict[str, Any] | None = None
     ) -> QueryChunksResponse:
-        index = self.cache.get(vector_db_id)
+        index = self.cache.get(vector_store_id)
         if index is None:
-            raise VectorStoreNotFoundError(vector_db_id)
+            raise VectorStoreNotFoundError(vector_store_id)

         return await index.query_chunks(query, params)
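The same two-method rename recurs in every provider adapter below. As a reference point, a minimal calling sketch for the renamed methods; the constructed `adapter` and `chunks` objects are assumed to exist, and only the keyword names reflect this diff.

```python
# Assumes `adapter` is a constructed FaissVectorIOAdapter and `chunks` is a
# list[Chunk]; only the keyword argument names come from this change.
await adapter.insert_chunks(vector_store_id="my-store", chunks=chunks)

response = await adapter.query_chunks(
    vector_store_id="my-store",
    query="What is the capital of France?",
    params={"max_chunks": 3, "score_threshold": 0.7},
)
for chunk, score in zip(response.chunks, response.scores, strict=False):
    print(score, chunk.content)
```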
diff --git a/src/llama_stack/providers/inline/vector_io/sqlite_vec/sqlite_vec.py b/src/llama_stack/providers/inline/vector_io/sqlite_vec/sqlite_vec.py
index 37294f173..accf5cead 100644
--- a/src/llama_stack/providers/inline/vector_io/sqlite_vec/sqlite_vec.py
+++ b/src/llama_stack/providers/inline/vector_io/sqlite_vec/sqlite_vec.py
@@ -447,20 +447,20 @@ class SQLiteVecVectorIOAdapter(OpenAIVectorStoreMixin, VectorIO, VectorStoresPro
         await self.cache[vector_store_id].index.delete()
         del self.cache[vector_store_id]

-    async def insert_chunks(self, vector_db_id: str, chunks: list[Chunk], ttl_seconds: int | None = None) -> None:
-        index = await self._get_and_cache_vector_store_index(vector_db_id)
+    async def insert_chunks(self, vector_store_id: str, chunks: list[Chunk], ttl_seconds: int | None = None) -> None:
+        index = await self._get_and_cache_vector_store_index(vector_store_id)
         if not index:
-            raise VectorStoreNotFoundError(vector_db_id)
+            raise VectorStoreNotFoundError(vector_store_id)

         # The VectorStoreWithIndex helper is expected to compute embeddings via the inference_api
         # and then call our index's add_chunks.
         await index.insert_chunks(chunks)

     async def query_chunks(
-        self, vector_db_id: str, query: Any, params: dict[str, Any] | None = None
+        self, vector_store_id: str, query: Any, params: dict[str, Any] | None = None
     ) -> QueryChunksResponse:
-        index = await self._get_and_cache_vector_store_index(vector_db_id)
+        index = await self._get_and_cache_vector_store_index(vector_store_id)
         if not index:
-            raise VectorStoreNotFoundError(vector_db_id)
+            raise VectorStoreNotFoundError(vector_store_id)
         return await index.query_chunks(query, params)

     async def delete_chunks(self, store_id: str, chunks_for_deletion: list[ChunkForDeletion]) -> None:
diff --git a/src/llama_stack/providers/registry/vector_io.py b/src/llama_stack/providers/registry/vector_io.py
index ff3b8486f..55b302751 100644
--- a/src/llama_stack/providers/registry/vector_io.py
+++ b/src/llama_stack/providers/registry/vector_io.py
@@ -163,14 +163,14 @@ The SQLite-vec provider supports three search modes:
 Example with hybrid search:
 ```python
 response = await vector_io.query_chunks(
-    vector_db_id="my_db",
+    vector_store_id="my_db",
     query="your query here",
     params={"mode": "hybrid", "max_chunks": 3, "score_threshold": 0.7},
 )

 # Using RRF ranker
 response = await vector_io.query_chunks(
-    vector_db_id="my_db",
+    vector_store_id="my_db",
     query="your query here",
     params={
         "mode": "hybrid",
@@ -182,7 +182,7 @@ response = await vector_io.query_chunks(

 # Using weighted ranker
 response = await vector_io.query_chunks(
-    vector_db_id="my_db",
+    vector_store_id="my_db",
     query="your query here",
     params={
         "mode": "hybrid",
@@ -196,7 +196,7 @@ response = await vector_io.query_chunks(

 Example with explicit vector search:
 ```python
 response = await vector_io.query_chunks(
-    vector_db_id="my_db",
+    vector_store_id="my_db",
     query="your query here",
     params={"mode": "vector", "max_chunks": 3, "score_threshold": 0.7},
 )
@@ -205,7 +205,7 @@ response = await vector_io.query_chunks(

 Example with keyword search:
 ```python
 response = await vector_io.query_chunks(
-    vector_db_id="my_db",
+    vector_store_id="my_db",
     query="your query here",
     params={"mode": "keyword", "max_chunks": 3, "score_threshold": 0.7},
 )
diff --git a/src/llama_stack/providers/remote/vector_io/chroma/chroma.py b/src/llama_stack/providers/remote/vector_io/chroma/chroma.py
index 2663ad43e..a4fd15f77 100644
--- a/src/llama_stack/providers/remote/vector_io/chroma/chroma.py
+++ b/src/llama_stack/providers/remote/vector_io/chroma/chroma.py
@@ -169,20 +169,20 @@ class ChromaVectorIOAdapter(OpenAIVectorStoreMixin, VectorIO, VectorStoresProtoc
         await self.cache[vector_store_id].index.delete()
         del self.cache[vector_store_id]

-    async def insert_chunks(self, vector_db_id: str, chunks: list[Chunk], ttl_seconds: int | None = None) -> None:
-        index = await self._get_and_cache_vector_store_index(vector_db_id)
+    async def insert_chunks(self, vector_store_id: str, chunks: list[Chunk], ttl_seconds: int | None = None) -> None:
+        index = await self._get_and_cache_vector_store_index(vector_store_id)
         if index is None:
-            raise ValueError(f"Vector DB {vector_db_id} not found in Chroma")
+            raise ValueError(f"Vector DB {vector_store_id} not found in Chroma")

         await index.insert_chunks(chunks)

     async def query_chunks(
-        self, vector_db_id: str, query: InterleavedContent, params: dict[str, Any] | None = None
+        self, vector_store_id: str, query: InterleavedContent, params: dict[str, Any] | None = None
     ) -> QueryChunksResponse:
-        index = await self._get_and_cache_vector_store_index(vector_db_id)
+        index = await self._get_and_cache_vector_store_index(vector_store_id)
         if index is None:
-            raise ValueError(f"Vector DB {vector_db_id} not found in Chroma")
+            raise ValueError(f"Vector DB {vector_store_id} not found in Chroma")

         return await index.query_chunks(query, params)
diff --git a/src/llama_stack/providers/remote/vector_io/milvus/milvus.py b/src/llama_stack/providers/remote/vector_io/milvus/milvus.py
index cccf13816..ace9ab1c4 100644
--- a/src/llama_stack/providers/remote/vector_io/milvus/milvus.py
+++ b/src/llama_stack/providers/remote/vector_io/milvus/milvus.py
@@ -348,19 +348,19 @@ class MilvusVectorIOAdapter(OpenAIVectorStoreMixin, VectorIO, VectorStoresProtoc
         await self.cache[vector_store_id].index.delete()
         del self.cache[vector_store_id]

-    async def insert_chunks(self, vector_db_id: str, chunks: list[Chunk], ttl_seconds: int | None = None) -> None:
-        index = await self._get_and_cache_vector_store_index(vector_db_id)
+    async def insert_chunks(self, vector_store_id: str, chunks: list[Chunk], ttl_seconds: int | None = None) -> None:
+        index = await self._get_and_cache_vector_store_index(vector_store_id)
         if not index:
-            raise VectorStoreNotFoundError(vector_db_id)
+            raise VectorStoreNotFoundError(vector_store_id)

         await index.insert_chunks(chunks)

     async def query_chunks(
-        self, vector_db_id: str, query: InterleavedContent, params: dict[str, Any] | None = None
+        self, vector_store_id: str, query: InterleavedContent, params: dict[str, Any] | None = None
     ) -> QueryChunksResponse:
-        index = await self._get_and_cache_vector_store_index(vector_db_id)
+        index = await self._get_and_cache_vector_store_index(vector_store_id)
         if not index:
-            raise VectorStoreNotFoundError(vector_db_id)
+            raise VectorStoreNotFoundError(vector_store_id)
         return await index.query_chunks(query, params)

     async def delete_chunks(self, store_id: str, chunks_for_deletion: list[ChunkForDeletion]) -> None:
diff --git a/src/llama_stack/providers/remote/vector_io/pgvector/pgvector.py b/src/llama_stack/providers/remote/vector_io/pgvector/pgvector.py
index f28bd3cd9..29cfd673f 100644
--- a/src/llama_stack/providers/remote/vector_io/pgvector/pgvector.py
+++ b/src/llama_stack/providers/remote/vector_io/pgvector/pgvector.py
@@ -399,14 +399,14 @@ class PGVectorVectorIOAdapter(OpenAIVectorStoreMixin, VectorIO, VectorStoresProt
         assert self.kvstore is not None
         await self.kvstore.delete(key=f"{VECTOR_DBS_PREFIX}{vector_store_id}")

-    async def insert_chunks(self, vector_db_id: str, chunks: list[Chunk], ttl_seconds: int | None = None) -> None:
-        index = await self._get_and_cache_vector_store_index(vector_db_id)
+    async def insert_chunks(self, vector_store_id: str, chunks: list[Chunk], ttl_seconds: int | None = None) -> None:
+        index = await self._get_and_cache_vector_store_index(vector_store_id)
         await index.insert_chunks(chunks)

     async def query_chunks(
-        self, vector_db_id: str, query: InterleavedContent, params: dict[str, Any] | None = None
+        self, vector_store_id: str, query: InterleavedContent, params: dict[str, Any] | None = None
     ) -> QueryChunksResponse:
-        index = await self._get_and_cache_vector_store_index(vector_db_id)
+        index = await self._get_and_cache_vector_store_index(vector_store_id)
         return await index.query_chunks(query, params)

     async def _get_and_cache_vector_store_index(self, vector_store_id: str) -> VectorStoreWithIndex:
diff --git a/src/llama_stack/providers/remote/vector_io/qdrant/qdrant.py b/src/llama_stack/providers/remote/vector_io/qdrant/qdrant.py
index 93d0894a6..266e9bf58 100644
--- a/src/llama_stack/providers/remote/vector_io/qdrant/qdrant.py
+++ b/src/llama_stack/providers/remote/vector_io/qdrant/qdrant.py
@@ -222,19 +222,19 @@ class QdrantVectorIOAdapter(OpenAIVectorStoreMixin, VectorIO, VectorStoresProtoc
         self.cache[vector_store_id] = index
         return index

-    async def insert_chunks(self, vector_db_id: str, chunks: list[Chunk], ttl_seconds: int | None = None) -> None:
-        index = await self._get_and_cache_vector_store_index(vector_db_id)
+    async def insert_chunks(self, vector_store_id: str, chunks: list[Chunk], ttl_seconds: int | None = None) -> None:
+        index = await self._get_and_cache_vector_store_index(vector_store_id)
         if not index:
-            raise VectorStoreNotFoundError(vector_db_id)
+            raise VectorStoreNotFoundError(vector_store_id)

         await index.insert_chunks(chunks)

     async def query_chunks(
-        self, vector_db_id: str, query: InterleavedContent, params: dict[str, Any] | None = None
+        self, vector_store_id: str, query: InterleavedContent, params: dict[str, Any] | None = None
     ) -> QueryChunksResponse:
-        index = await self._get_and_cache_vector_store_index(vector_db_id)
+        index = await self._get_and_cache_vector_store_index(vector_store_id)
         if not index:
-            raise VectorStoreNotFoundError(vector_db_id)
+            raise VectorStoreNotFoundError(vector_store_id)

         return await index.query_chunks(query, params)
diff --git a/src/llama_stack/providers/remote/vector_io/weaviate/weaviate.py b/src/llama_stack/providers/remote/vector_io/weaviate/weaviate.py
index 66922aa3f..7813f6e5c 100644
--- a/src/llama_stack/providers/remote/vector_io/weaviate/weaviate.py
+++ b/src/llama_stack/providers/remote/vector_io/weaviate/weaviate.py
@@ -366,19 +366,19 @@ class WeaviateVectorIOAdapter(OpenAIVectorStoreMixin, VectorIO, NeedsRequestProv
         self.cache[vector_store_id] = index
         return index

-    async def insert_chunks(self, vector_db_id: str, chunks: list[Chunk], ttl_seconds: int | None = None) -> None:
-        index = await self._get_and_cache_vector_store_index(vector_db_id)
+    async def insert_chunks(self, vector_store_id: str, chunks: list[Chunk], ttl_seconds: int | None = None) -> None:
+        index = await self._get_and_cache_vector_store_index(vector_store_id)
         if not index:
-            raise VectorStoreNotFoundError(vector_db_id)
+            raise VectorStoreNotFoundError(vector_store_id)

         await index.insert_chunks(chunks)

     async def query_chunks(
-        self, vector_db_id: str, query: InterleavedContent, params: dict[str, Any] | None = None
+        self, vector_store_id: str, query: InterleavedContent, params: dict[str, Any] | None = None
     ) -> QueryChunksResponse:
-        index = await self._get_and_cache_vector_store_index(vector_db_id)
+        index = await self._get_and_cache_vector_store_index(vector_store_id)
         if not index:
-            raise VectorStoreNotFoundError(vector_db_id)
+            raise VectorStoreNotFoundError(vector_store_id)

         return await index.query_chunks(query, params)
diff --git a/src/llama_stack/providers/utils/memory/openai_vector_store_mixin.py b/src/llama_stack/providers/utils/memory/openai_vector_store_mixin.py
index 8f9fb9fb4..41d4cb2d7 100644
--- a/src/llama_stack/providers/utils/memory/openai_vector_store_mixin.py
+++ b/src/llama_stack/providers/utils/memory/openai_vector_store_mixin.py
@@ -333,7 +333,7 @@ class OpenAIVectorStoreMixin(ABC):
     @abstractmethod
     async def insert_chunks(
         self,
-        vector_db_id: str,
+        vector_store_id: str,
         chunks: list[Chunk],
         ttl_seconds: int | None = None,
     ) -> None:
@@ -342,7 +342,7 @@ class OpenAIVectorStoreMixin(ABC):

     @abstractmethod
     async def query_chunks(
-        self, vector_db_id: str, query: Any, params: dict[str, Any] | None = None
+        self, vector_store_id: str, query: Any, params: dict[str, Any] | None = None
     ) -> QueryChunksResponse:
         """Query chunks from a vector database (provider-specific implementation)."""
         pass
@@ -609,7 +609,7 @@ class OpenAIVectorStoreMixin(ABC):

         # TODO: Add support for ranking_options.ranker
         response = await self.query_chunks(
-            vector_db_id=vector_store_id,
+            vector_store_id=vector_store_id,
             query=search_query,
             params=params,
         )
@@ -803,7 +803,7 @@ class OpenAIVectorStoreMixin(ABC):
             )
         else:
             await self.insert_chunks(
-                vector_db_id=vector_store_id,
+                vector_store_id=vector_store_id,
                 chunks=chunks,
             )
         vector_store_file_object.status = "completed"
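Because `OpenAIVectorStoreMixin` now declares the renamed abstract methods, every concrete adapter must adopt the new parameter name to satisfy the ABC. A skeletal sketch follows; the `_get_index` helper and import paths other than the mixin's own module are placeholders/assumptions, not code from this diff.

```python
# Skeleton only: shows the signatures a concrete adapter must now implement.
from typing import Any

from llama_stack.providers.utils.memory.openai_vector_store_mixin import OpenAIVectorStoreMixin
# Chunk, QueryChunksResponse, VectorIO imports assumed from the vector_io API module.


class MyVectorIOAdapter(OpenAIVectorStoreMixin, VectorIO):
    async def insert_chunks(
        self,
        vector_store_id: str,
        chunks: list[Chunk],
        ttl_seconds: int | None = None,
    ) -> None:
        index = await self._get_index(vector_store_id)  # placeholder helper
        await index.insert_chunks(chunks)

    async def query_chunks(
        self, vector_store_id: str, query: Any, params: dict[str, Any] | None = None
    ) -> QueryChunksResponse:
        index = await self._get_index(vector_store_id)  # placeholder helper
        return await index.query_chunks(query, params)
```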
diff --git a/tests/integration/vector_io/test_openai_vector_stores.py b/tests/integration/vector_io/test_openai_vector_stores.py
index 626faf42d..f2131c614 100644
--- a/tests/integration/vector_io/test_openai_vector_stores.py
+++ b/tests/integration/vector_io/test_openai_vector_stores.py
@@ -367,7 +367,7 @@ def test_openai_vector_store_with_chunks(

     # Insert chunks using the native LlamaStack API (since OpenAI API doesn't have direct chunk insertion)
     llama_client.vector_io.insert(
-        vector_db_id=vector_store.id,
+        vector_store_id=vector_store.id,
         chunks=sample_chunks,
     )
@@ -434,7 +434,7 @@ def test_openai_vector_store_search_relevance(

     # Insert chunks using native API
     llama_client.vector_io.insert(
-        vector_db_id=vector_store.id,
+        vector_store_id=vector_store.id,
         chunks=sample_chunks,
     )
@@ -484,7 +484,7 @@ def test_openai_vector_store_search_with_ranking_options(

     # Insert chunks
     llama_client.vector_io.insert(
-        vector_db_id=vector_store.id,
+        vector_store_id=vector_store.id,
         chunks=sample_chunks,
     )
@@ -544,7 +544,7 @@ def test_openai_vector_store_search_with_high_score_filter(

     # Insert chunks
     llama_client.vector_io.insert(
-        vector_db_id=vector_store.id,
+        vector_store_id=vector_store.id,
         chunks=sample_chunks,
     )
@@ -610,7 +610,7 @@ def test_openai_vector_store_search_with_max_num_results(

     # Insert chunks
     llama_client.vector_io.insert(
-        vector_db_id=vector_store.id,
+        vector_store_id=vector_store.id,
         chunks=sample_chunks,
     )
@@ -1175,7 +1175,7 @@ def test_openai_vector_store_search_modes(
     )

     client_with_models.vector_io.insert(
-        vector_db_id=vector_store.id,
+        vector_store_id=vector_store.id,
         chunks=sample_chunks,
     )
     query = "Python programming language"
diff --git a/tests/integration/vector_io/test_vector_io.py b/tests/integration/vector_io/test_vector_io.py
index 1f67ddb24..a312456b9 100644
--- a/tests/integration/vector_io/test_vector_io.py
+++ b/tests/integration/vector_io/test_vector_io.py
@@ -123,12 +123,12 @@ def test_insert_chunks(
     actual_vector_store_id = create_response.id

     client_with_empty_registry.vector_io.insert(
-        vector_db_id=actual_vector_store_id,
+        vector_store_id=actual_vector_store_id,
         chunks=sample_chunks,
     )

     response = client_with_empty_registry.vector_io.query(
-        vector_db_id=actual_vector_store_id,
+        vector_store_id=actual_vector_store_id,
         query="What is the capital of France?",
     )
     assert response is not None
@@ -137,7 +137,7 @@ def test_insert_chunks(
     query, expected_doc_id = test_case
     response = client_with_empty_registry.vector_io.query(
-        vector_db_id=actual_vector_store_id,
+        vector_store_id=actual_vector_store_id,
         query=query,
     )
     assert response is not None
@@ -174,13 +174,13 @@ def test_insert_chunks_with_precomputed_embeddings(
     ]

     client_with_empty_registry.vector_io.insert(
-        vector_db_id=actual_vector_store_id,
+        vector_store_id=actual_vector_store_id,
         chunks=chunks_with_embeddings,
     )

     provider = [p.provider_id for p in client_with_empty_registry.providers.list() if p.api == "vector_io"][0]
     response = client_with_empty_registry.vector_io.query(
-        vector_db_id=actual_vector_store_id,
+        vector_store_id=actual_vector_store_id,
         query="precomputed embedding test",
         params=vector_io_provider_params_dict.get(provider, None),
     )
@@ -224,13 +224,13 @@ def test_query_returns_valid_object_when_identical_to_embedding_in_vdb(
     ]

     client_with_empty_registry.vector_io.insert(
-        vector_db_id=actual_vector_store_id,
+        vector_store_id=actual_vector_store_id,
         chunks=chunks_with_embeddings,
     )

     provider = [p.provider_id for p in client_with_empty_registry.providers.list() if p.api == "vector_io"][0]
     response = client_with_empty_registry.vector_io.query(
-        vector_db_id=actual_vector_store_id,
+        vector_store_id=actual_vector_store_id,
         query="duplicate",
         params=vector_io_provider_params_dict.get(provider, None),
     )
diff --git a/tests/unit/rag/test_rag_query.py b/tests/unit/rag/test_rag_query.py
index c012bc4f0..45b194332 100644
--- a/tests/unit/rag/test_rag_query.py
+++ b/tests/unit/rag/test_rag_query.py
@@ -23,14 +23,14 @@ class TestRagQuery:
             config=MagicMock(), vector_io_api=MagicMock(), inference_api=MagicMock(), files_api=MagicMock()
         )
         with pytest.raises(ValueError):
-            await rag_tool.query(content=MagicMock(), vector_db_ids=[])
+            await rag_tool.query(content=MagicMock(), vector_store_ids=[])

     async def test_query_chunk_metadata_handling(self):
         rag_tool = MemoryToolRuntimeImpl(
             config=MagicMock(), vector_io_api=MagicMock(), inference_api=MagicMock(), files_api=MagicMock()
         )
         content = "test query content"
-        vector_db_ids = ["db1"]
+        vector_store_ids = ["db1"]

         chunk_metadata = ChunkMetadata(
             document_id="doc1",
@@ -55,7 +55,7 @@ class TestRagQuery:
         query_response = QueryChunksResponse(chunks=[chunk], scores=[1.0])
         rag_tool.vector_io_api.query_chunks = AsyncMock(return_value=query_response)

-        result = await rag_tool.query(content=content, vector_db_ids=vector_db_ids)
+        result = await rag_tool.query(content=content, vector_store_ids=vector_store_ids)
         assert result is not None
         expected_metadata_string = (
@@ -90,7 +90,7 @@ class TestRagQuery:
             files_api=MagicMock(),
         )

-        vector_db_ids = ["db1", "db2"]
+        vector_store_ids = ["db1", "db2"]

         # Fake chunks from each DB
         chunk_metadata1 = ChunkMetadata(
@@ -101,7 +101,7 @@
         )
         chunk1 = Chunk(
             content="chunk from db1",
-            metadata={"vector_db_id": "db1", "document_id": "doc1"},
+            metadata={"vector_store_id": "db1", "document_id": "doc1"},
             stored_chunk_id="c1",
             chunk_metadata=chunk_metadata1,
         )
@@ -114,7 +114,7 @@
         )
         chunk2 = Chunk(
             content="chunk from db2",
-            metadata={"vector_db_id": "db2", "document_id": "doc2"},
+            metadata={"vector_store_id": "db2", "document_id": "doc2"},
             stored_chunk_id="c2",
             chunk_metadata=chunk_metadata2,
         )
@@ -126,13 +126,13 @@
             ]
         )

-        result = await rag_tool.query(content="test", vector_db_ids=vector_db_ids)
+        result = await rag_tool.query(content="test", vector_store_ids=vector_store_ids)

         returned_chunks = result.metadata["chunks"]
         returned_scores = result.metadata["scores"]
         returned_doc_ids = result.metadata["document_ids"]
-        returned_vector_db_ids = result.metadata["vector_db_ids"]
+        returned_vector_store_ids = result.metadata["vector_store_ids"]

         assert returned_chunks == ["chunk from db1", "chunk from db2"]
         assert returned_scores == (0.9, 0.8)
         assert returned_doc_ids == ["doc1", "doc2"]
-        assert returned_vector_db_ids == ["db1", "db2"]
+        assert returned_vector_store_ids == ["db1", "db2"]
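Taken together, the test updates above show the migration required of callers of the client-side vector_io API: only the keyword changes. In this sketch, `client`, `vector_store`, and `sample_chunks` are assumed to be set up as in the tests.

```python
# Before (rejected after this change):
#   client.vector_io.insert(vector_db_id=vector_store.id, chunks=sample_chunks)
#   client.vector_io.query(vector_db_id=vector_store.id, query="...")

# After: pass vector_store_id instead; behaviour is otherwise unchanged.
client.vector_io.insert(
    vector_store_id=vector_store.id,
    chunks=sample_chunks,
)
response = client.vector_io.query(
    vector_store_id=vector_store.id,
    query="What is the capital of France?",
)
```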