Merge branch 'main' into add-nvidia-nim-to-docs
This commit is contained in:
commit
7652db75b4
181 changed files with 7358 additions and 749 deletions
|
|
@ -9,3 +9,4 @@ sphinx-tabs
|
|||
sphinx-design
|
||||
sphinxcontrib-openapi
|
||||
sphinxcontrib-redoc
|
||||
sphinxcontrib-mermaid
|
||||
|
|
|
|||
|
|
@ -21,7 +21,7 @@
|
|||
"info": {
|
||||
"title": "Llama Stack Specification",
|
||||
"version": "alpha",
|
||||
"description": "This is the specification of the Llama Stack that provides\n a set of endpoints and their corresponding interfaces that are tailored to\n best leverage Llama Models. Generated at 2024-11-22 17:23:55.034164"
|
||||
"description": "This is the specification of the Llama Stack that provides\n a set of endpoints and their corresponding interfaces that are tailored to\n best leverage Llama Models."
|
||||
},
|
||||
"servers": [
|
||||
{
|
||||
|
|
@ -29,6 +29,39 @@
|
|||
}
|
||||
],
|
||||
"paths": {
|
||||
"/alpha/datasetio/append-rows": {
|
||||
"post": {
|
||||
"responses": {
|
||||
"200": {
|
||||
"description": "OK"
|
||||
}
|
||||
},
|
||||
"tags": [
|
||||
"DatasetIO"
|
||||
],
|
||||
"parameters": [
|
||||
{
|
||||
"name": "X-LlamaStack-ProviderData",
|
||||
"in": "header",
|
||||
"description": "JSON-encoded provider data which will be made available to the adapter servicing the API",
|
||||
"required": false,
|
||||
"schema": {
|
||||
"type": "string"
|
||||
}
|
||||
}
|
||||
],
|
||||
"requestBody": {
|
||||
"content": {
|
||||
"application/json": {
|
||||
"schema": {
|
||||
"$ref": "#/components/schemas/AppendRowsRequest"
|
||||
}
|
||||
}
|
||||
},
|
||||
"required": true
|
||||
}
|
||||
}
|
||||
},
|
||||
"/alpha/batch-inference/chat-completion": {
|
||||
"post": {
|
||||
"responses": {
|
||||
|
|
@ -1026,15 +1059,15 @@
|
|||
]
|
||||
}
|
||||
},
|
||||
"/alpha/telemetry/get-trace": {
|
||||
"get": {
|
||||
"/alpha/telemetry/get-span-tree": {
|
||||
"post": {
|
||||
"responses": {
|
||||
"200": {
|
||||
"description": "OK",
|
||||
"content": {
|
||||
"application/json": {
|
||||
"schema": {
|
||||
"$ref": "#/components/schemas/Trace"
|
||||
"$ref": "#/components/schemas/SpanWithChildren"
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -1045,13 +1078,21 @@
|
|||
],
|
||||
"parameters": [
|
||||
{
|
||||
"name": "trace_id",
|
||||
"name": "span_id",
|
||||
"in": "query",
|
||||
"required": true,
|
||||
"schema": {
|
||||
"type": "string"
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "max_depth",
|
||||
"in": "query",
|
||||
"required": false,
|
||||
"schema": {
|
||||
"type": "integer"
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "X-LlamaStack-ProviderData",
|
||||
"in": "header",
|
||||
|
|
@ -1061,7 +1102,17 @@
|
|||
"type": "string"
|
||||
}
|
||||
}
|
||||
]
|
||||
],
|
||||
"requestBody": {
|
||||
"content": {
|
||||
"application/json": {
|
||||
"schema": {
|
||||
"$ref": "#/components/schemas/GetSpanTreeRequest"
|
||||
}
|
||||
}
|
||||
},
|
||||
"required": true
|
||||
}
|
||||
}
|
||||
},
|
||||
"/alpha/post-training/job/artifacts": {
|
||||
|
|
@ -1778,6 +1829,86 @@
|
|||
}
|
||||
}
|
||||
},
|
||||
"/alpha/telemetry/query-spans": {
|
||||
"post": {
|
||||
"responses": {
|
||||
"200": {
|
||||
"description": "OK",
|
||||
"content": {
|
||||
"application/jsonl": {
|
||||
"schema": {
|
||||
"$ref": "#/components/schemas/Span"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"tags": [
|
||||
"Telemetry"
|
||||
],
|
||||
"parameters": [
|
||||
{
|
||||
"name": "X-LlamaStack-ProviderData",
|
||||
"in": "header",
|
||||
"description": "JSON-encoded provider data which will be made available to the adapter servicing the API",
|
||||
"required": false,
|
||||
"schema": {
|
||||
"type": "string"
|
||||
}
|
||||
}
|
||||
],
|
||||
"requestBody": {
|
||||
"content": {
|
||||
"application/json": {
|
||||
"schema": {
|
||||
"$ref": "#/components/schemas/QuerySpansRequest"
|
||||
}
|
||||
}
|
||||
},
|
||||
"required": true
|
||||
}
|
||||
}
|
||||
},
|
||||
"/alpha/telemetry/query-traces": {
|
||||
"post": {
|
||||
"responses": {
|
||||
"200": {
|
||||
"description": "OK",
|
||||
"content": {
|
||||
"application/jsonl": {
|
||||
"schema": {
|
||||
"$ref": "#/components/schemas/Trace"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"tags": [
|
||||
"Telemetry"
|
||||
],
|
||||
"parameters": [
|
||||
{
|
||||
"name": "X-LlamaStack-ProviderData",
|
||||
"in": "header",
|
||||
"description": "JSON-encoded provider data which will be made available to the adapter servicing the API",
|
||||
"required": false,
|
||||
"schema": {
|
||||
"type": "string"
|
||||
}
|
||||
}
|
||||
],
|
||||
"requestBody": {
|
||||
"content": {
|
||||
"application/json": {
|
||||
"schema": {
|
||||
"$ref": "#/components/schemas/QueryTracesRequest"
|
||||
}
|
||||
}
|
||||
},
|
||||
"required": true
|
||||
}
|
||||
}
|
||||
},
|
||||
"/alpha/datasets/register": {
|
||||
"post": {
|
||||
"responses": {
|
||||
|
|
@ -2066,6 +2197,39 @@
|
|||
}
|
||||
}
|
||||
},
|
||||
"/alpha/telemetry/save-spans-to-dataset": {
|
||||
"post": {
|
||||
"responses": {
|
||||
"200": {
|
||||
"description": "OK"
|
||||
}
|
||||
},
|
||||
"tags": [
|
||||
"Telemetry"
|
||||
],
|
||||
"parameters": [
|
||||
{
|
||||
"name": "X-LlamaStack-ProviderData",
|
||||
"in": "header",
|
||||
"description": "JSON-encoded provider data which will be made available to the adapter servicing the API",
|
||||
"required": false,
|
||||
"schema": {
|
||||
"type": "string"
|
||||
}
|
||||
}
|
||||
],
|
||||
"requestBody": {
|
||||
"content": {
|
||||
"application/json": {
|
||||
"schema": {
|
||||
"$ref": "#/components/schemas/SaveSpansToDatasetRequest"
|
||||
}
|
||||
}
|
||||
},
|
||||
"required": true
|
||||
}
|
||||
}
|
||||
},
|
||||
"/alpha/scoring/score": {
|
||||
"post": {
|
||||
"responses": {
|
||||
|
|
@ -2226,6 +2390,39 @@
|
|||
}
|
||||
}
|
||||
},
|
||||
"/alpha/datasets/unregister": {
|
||||
"post": {
|
||||
"responses": {
|
||||
"200": {
|
||||
"description": "OK"
|
||||
}
|
||||
},
|
||||
"tags": [
|
||||
"Datasets"
|
||||
],
|
||||
"parameters": [
|
||||
{
|
||||
"name": "X-LlamaStack-ProviderData",
|
||||
"in": "header",
|
||||
"description": "JSON-encoded provider data which will be made available to the adapter servicing the API",
|
||||
"required": false,
|
||||
"schema": {
|
||||
"type": "string"
|
||||
}
|
||||
}
|
||||
],
|
||||
"requestBody": {
|
||||
"content": {
|
||||
"application/json": {
|
||||
"schema": {
|
||||
"$ref": "#/components/schemas/UnregisterDatasetRequest"
|
||||
}
|
||||
}
|
||||
},
|
||||
"required": true
|
||||
}
|
||||
}
|
||||
},
|
||||
"/alpha/memory-banks/unregister": {
|
||||
"post": {
|
||||
"responses": {
|
||||
|
|
@ -2296,6 +2493,47 @@
|
|||
"jsonSchemaDialect": "https://json-schema.org/draft/2020-12/schema",
|
||||
"components": {
|
||||
"schemas": {
|
||||
"AppendRowsRequest": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"dataset_id": {
|
||||
"type": "string"
|
||||
},
|
||||
"rows": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "object",
|
||||
"additionalProperties": {
|
||||
"oneOf": [
|
||||
{
|
||||
"type": "null"
|
||||
},
|
||||
{
|
||||
"type": "boolean"
|
||||
},
|
||||
{
|
||||
"type": "number"
|
||||
},
|
||||
{
|
||||
"type": "string"
|
||||
},
|
||||
{
|
||||
"type": "array"
|
||||
},
|
||||
{
|
||||
"type": "object"
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"additionalProperties": false,
|
||||
"required": [
|
||||
"dataset_id",
|
||||
"rows"
|
||||
]
|
||||
},
|
||||
"BuiltinTool": {
|
||||
"type": "string",
|
||||
"enum": [
|
||||
|
|
@ -4130,14 +4368,11 @@
|
|||
"step_id": {
|
||||
"type": "string"
|
||||
},
|
||||
"model_response_text_delta": {
|
||||
"text_delta": {
|
||||
"type": "string"
|
||||
},
|
||||
"tool_call_delta": {
|
||||
"$ref": "#/components/schemas/ToolCallDelta"
|
||||
},
|
||||
"tool_response_text_delta": {
|
||||
"type": "string"
|
||||
}
|
||||
},
|
||||
"additionalProperties": false,
|
||||
|
|
@ -5845,13 +6080,38 @@
|
|||
],
|
||||
"title": "A safety shield resource that can be used to check content"
|
||||
},
|
||||
"Trace": {
|
||||
"GetSpanTreeRequest": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"attributes_to_return": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "string"
|
||||
}
|
||||
}
|
||||
},
|
||||
"additionalProperties": false
|
||||
},
|
||||
"SpanStatus": {
|
||||
"type": "string",
|
||||
"enum": [
|
||||
"ok",
|
||||
"error"
|
||||
]
|
||||
},
|
||||
"SpanWithChildren": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"span_id": {
|
||||
"type": "string"
|
||||
},
|
||||
"trace_id": {
|
||||
"type": "string"
|
||||
},
|
||||
"root_span_id": {
|
||||
"parent_span_id": {
|
||||
"type": "string"
|
||||
},
|
||||
"name": {
|
||||
"type": "string"
|
||||
},
|
||||
"start_time": {
|
||||
|
|
@ -5861,13 +6121,49 @@
|
|||
"end_time": {
|
||||
"type": "string",
|
||||
"format": "date-time"
|
||||
},
|
||||
"attributes": {
|
||||
"type": "object",
|
||||
"additionalProperties": {
|
||||
"oneOf": [
|
||||
{
|
||||
"type": "null"
|
||||
},
|
||||
{
|
||||
"type": "boolean"
|
||||
},
|
||||
{
|
||||
"type": "number"
|
||||
},
|
||||
{
|
||||
"type": "string"
|
||||
},
|
||||
{
|
||||
"type": "array"
|
||||
},
|
||||
{
|
||||
"type": "object"
|
||||
}
|
||||
]
|
||||
}
|
||||
},
|
||||
"children": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"$ref": "#/components/schemas/SpanWithChildren"
|
||||
}
|
||||
},
|
||||
"status": {
|
||||
"$ref": "#/components/schemas/SpanStatus"
|
||||
}
|
||||
},
|
||||
"additionalProperties": false,
|
||||
"required": [
|
||||
"span_id",
|
||||
"trace_id",
|
||||
"root_span_id",
|
||||
"start_time"
|
||||
"name",
|
||||
"start_time",
|
||||
"children"
|
||||
]
|
||||
},
|
||||
"Checkpoint": {
|
||||
|
|
@ -6280,13 +6576,6 @@
|
|||
"name"
|
||||
]
|
||||
},
|
||||
"SpanStatus": {
|
||||
"type": "string",
|
||||
"enum": [
|
||||
"ok",
|
||||
"error"
|
||||
]
|
||||
},
|
||||
"StructuredLogEvent": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
|
|
@ -6425,11 +6714,15 @@
|
|||
"$ref": "#/components/schemas/StructuredLogEvent"
|
||||
}
|
||||
]
|
||||
},
|
||||
"ttl_seconds": {
|
||||
"type": "integer"
|
||||
}
|
||||
},
|
||||
"additionalProperties": false,
|
||||
"required": [
|
||||
"event"
|
||||
"event",
|
||||
"ttl_seconds"
|
||||
]
|
||||
},
|
||||
"DPOAlignmentConfig": {
|
||||
|
|
@ -6739,6 +7032,185 @@
|
|||
"scores"
|
||||
]
|
||||
},
|
||||
"QueryCondition": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"key": {
|
||||
"type": "string"
|
||||
},
|
||||
"op": {
|
||||
"$ref": "#/components/schemas/QueryConditionOp"
|
||||
},
|
||||
"value": {
|
||||
"oneOf": [
|
||||
{
|
||||
"type": "null"
|
||||
},
|
||||
{
|
||||
"type": "boolean"
|
||||
},
|
||||
{
|
||||
"type": "number"
|
||||
},
|
||||
{
|
||||
"type": "string"
|
||||
},
|
||||
{
|
||||
"type": "array"
|
||||
},
|
||||
{
|
||||
"type": "object"
|
||||
}
|
||||
]
|
||||
}
|
||||
},
|
||||
"additionalProperties": false,
|
||||
"required": [
|
||||
"key",
|
||||
"op",
|
||||
"value"
|
||||
]
|
||||
},
|
||||
"QueryConditionOp": {
|
||||
"type": "string",
|
||||
"enum": [
|
||||
"eq",
|
||||
"ne",
|
||||
"gt",
|
||||
"lt"
|
||||
]
|
||||
},
|
||||
"QuerySpansRequest": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"attribute_filters": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"$ref": "#/components/schemas/QueryCondition"
|
||||
}
|
||||
},
|
||||
"attributes_to_return": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "string"
|
||||
}
|
||||
},
|
||||
"max_depth": {
|
||||
"type": "integer"
|
||||
}
|
||||
},
|
||||
"additionalProperties": false,
|
||||
"required": [
|
||||
"attribute_filters",
|
||||
"attributes_to_return"
|
||||
]
|
||||
},
|
||||
"Span": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"span_id": {
|
||||
"type": "string"
|
||||
},
|
||||
"trace_id": {
|
||||
"type": "string"
|
||||
},
|
||||
"parent_span_id": {
|
||||
"type": "string"
|
||||
},
|
||||
"name": {
|
||||
"type": "string"
|
||||
},
|
||||
"start_time": {
|
||||
"type": "string",
|
||||
"format": "date-time"
|
||||
},
|
||||
"end_time": {
|
||||
"type": "string",
|
||||
"format": "date-time"
|
||||
},
|
||||
"attributes": {
|
||||
"type": "object",
|
||||
"additionalProperties": {
|
||||
"oneOf": [
|
||||
{
|
||||
"type": "null"
|
||||
},
|
||||
{
|
||||
"type": "boolean"
|
||||
},
|
||||
{
|
||||
"type": "number"
|
||||
},
|
||||
{
|
||||
"type": "string"
|
||||
},
|
||||
{
|
||||
"type": "array"
|
||||
},
|
||||
{
|
||||
"type": "object"
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
"additionalProperties": false,
|
||||
"required": [
|
||||
"span_id",
|
||||
"trace_id",
|
||||
"name",
|
||||
"start_time"
|
||||
]
|
||||
},
|
||||
"QueryTracesRequest": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"attribute_filters": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"$ref": "#/components/schemas/QueryCondition"
|
||||
}
|
||||
},
|
||||
"limit": {
|
||||
"type": "integer"
|
||||
},
|
||||
"offset": {
|
||||
"type": "integer"
|
||||
},
|
||||
"order_by": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "string"
|
||||
}
|
||||
}
|
||||
},
|
||||
"additionalProperties": false
|
||||
},
|
||||
"Trace": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"trace_id": {
|
||||
"type": "string"
|
||||
},
|
||||
"root_span_id": {
|
||||
"type": "string"
|
||||
},
|
||||
"start_time": {
|
||||
"type": "string",
|
||||
"format": "date-time"
|
||||
},
|
||||
"end_time": {
|
||||
"type": "string",
|
||||
"format": "date-time"
|
||||
}
|
||||
},
|
||||
"additionalProperties": false,
|
||||
"required": [
|
||||
"trace_id",
|
||||
"root_span_id",
|
||||
"start_time"
|
||||
]
|
||||
},
|
||||
"RegisterDatasetRequest": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
|
|
@ -7455,6 +7927,35 @@
|
|||
},
|
||||
"additionalProperties": false
|
||||
},
|
||||
"SaveSpansToDatasetRequest": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"attribute_filters": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"$ref": "#/components/schemas/QueryCondition"
|
||||
}
|
||||
},
|
||||
"attributes_to_save": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "string"
|
||||
}
|
||||
},
|
||||
"dataset_id": {
|
||||
"type": "string"
|
||||
},
|
||||
"max_depth": {
|
||||
"type": "integer"
|
||||
}
|
||||
},
|
||||
"additionalProperties": false,
|
||||
"required": [
|
||||
"attribute_filters",
|
||||
"attributes_to_save",
|
||||
"dataset_id"
|
||||
]
|
||||
},
|
||||
"ScoreRequest": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
|
|
@ -7894,6 +8395,18 @@
|
|||
],
|
||||
"title": "Response from the synthetic data generation. Batch of (prompt, response, score) tuples that pass the threshold."
|
||||
},
|
||||
"UnregisterDatasetRequest": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"dataset_id": {
|
||||
"type": "string"
|
||||
}
|
||||
},
|
||||
"additionalProperties": false,
|
||||
"required": [
|
||||
"dataset_id"
|
||||
]
|
||||
},
|
||||
"UnregisterMemoryBankRequest": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
|
|
@ -7982,6 +8495,10 @@
|
|||
"name": "AppEvalTaskConfig",
|
||||
"description": "<SchemaDefinition schemaRef=\"#/components/schemas/AppEvalTaskConfig\" />"
|
||||
},
|
||||
{
|
||||
"name": "AppendRowsRequest",
|
||||
"description": "<SchemaDefinition schemaRef=\"#/components/schemas/AppendRowsRequest\" />"
|
||||
},
|
||||
{
|
||||
"name": "Attachment",
|
||||
"description": "<SchemaDefinition schemaRef=\"#/components/schemas/Attachment\" />"
|
||||
|
|
@ -8137,6 +8654,10 @@
|
|||
"name": "GetAgentsSessionRequest",
|
||||
"description": "<SchemaDefinition schemaRef=\"#/components/schemas/GetAgentsSessionRequest\" />"
|
||||
},
|
||||
{
|
||||
"name": "GetSpanTreeRequest",
|
||||
"description": "<SchemaDefinition schemaRef=\"#/components/schemas/GetSpanTreeRequest\" />"
|
||||
},
|
||||
{
|
||||
"name": "GraphMemoryBank",
|
||||
"description": "<SchemaDefinition schemaRef=\"#/components/schemas/GraphMemoryBank\" />"
|
||||
|
|
@ -8291,6 +8812,14 @@
|
|||
"name": "QLoraFinetuningConfig",
|
||||
"description": "<SchemaDefinition schemaRef=\"#/components/schemas/QLoraFinetuningConfig\" />"
|
||||
},
|
||||
{
|
||||
"name": "QueryCondition",
|
||||
"description": "<SchemaDefinition schemaRef=\"#/components/schemas/QueryCondition\" />"
|
||||
},
|
||||
{
|
||||
"name": "QueryConditionOp",
|
||||
"description": "<SchemaDefinition schemaRef=\"#/components/schemas/QueryConditionOp\" />"
|
||||
},
|
||||
{
|
||||
"name": "QueryDocumentsRequest",
|
||||
"description": "<SchemaDefinition schemaRef=\"#/components/schemas/QueryDocumentsRequest\" />"
|
||||
|
|
@ -8299,6 +8828,14 @@
|
|||
"name": "QueryDocumentsResponse",
|
||||
"description": "<SchemaDefinition schemaRef=\"#/components/schemas/QueryDocumentsResponse\" />"
|
||||
},
|
||||
{
|
||||
"name": "QuerySpansRequest",
|
||||
"description": "<SchemaDefinition schemaRef=\"#/components/schemas/QuerySpansRequest\" />"
|
||||
},
|
||||
{
|
||||
"name": "QueryTracesRequest",
|
||||
"description": "<SchemaDefinition schemaRef=\"#/components/schemas/QueryTracesRequest\" />"
|
||||
},
|
||||
{
|
||||
"name": "RLHFAlgorithm",
|
||||
"description": "<SchemaDefinition schemaRef=\"#/components/schemas/RLHFAlgorithm\" />"
|
||||
|
|
@ -8370,6 +8907,10 @@
|
|||
"name": "SamplingStrategy",
|
||||
"description": "<SchemaDefinition schemaRef=\"#/components/schemas/SamplingStrategy\" />"
|
||||
},
|
||||
{
|
||||
"name": "SaveSpansToDatasetRequest",
|
||||
"description": "<SchemaDefinition schemaRef=\"#/components/schemas/SaveSpansToDatasetRequest\" />"
|
||||
},
|
||||
{
|
||||
"name": "ScoreBatchRequest",
|
||||
"description": "<SchemaDefinition schemaRef=\"#/components/schemas/ScoreBatchRequest\" />"
|
||||
|
|
@ -8419,6 +8960,10 @@
|
|||
{
|
||||
"name": "Shields"
|
||||
},
|
||||
{
|
||||
"name": "Span",
|
||||
"description": "<SchemaDefinition schemaRef=\"#/components/schemas/Span\" />"
|
||||
},
|
||||
{
|
||||
"name": "SpanEndPayload",
|
||||
"description": "<SchemaDefinition schemaRef=\"#/components/schemas/SpanEndPayload\" />"
|
||||
|
|
@ -8431,6 +8976,10 @@
|
|||
"name": "SpanStatus",
|
||||
"description": "<SchemaDefinition schemaRef=\"#/components/schemas/SpanStatus\" />"
|
||||
},
|
||||
{
|
||||
"name": "SpanWithChildren",
|
||||
"description": "<SchemaDefinition schemaRef=\"#/components/schemas/SpanWithChildren\" />"
|
||||
},
|
||||
{
|
||||
"name": "StopReason",
|
||||
"description": "<SchemaDefinition schemaRef=\"#/components/schemas/StopReason\" />"
|
||||
|
|
@ -8521,6 +9070,10 @@
|
|||
"name": "URL",
|
||||
"description": "<SchemaDefinition schemaRef=\"#/components/schemas/URL\" />"
|
||||
},
|
||||
{
|
||||
"name": "UnregisterDatasetRequest",
|
||||
"description": "<SchemaDefinition schemaRef=\"#/components/schemas/UnregisterDatasetRequest\" />"
|
||||
},
|
||||
{
|
||||
"name": "UnregisterMemoryBankRequest",
|
||||
"description": "<SchemaDefinition schemaRef=\"#/components/schemas/UnregisterMemoryBankRequest\" />"
|
||||
|
|
@ -8594,6 +9147,7 @@
|
|||
"AgentTurnResponseTurnCompletePayload",
|
||||
"AgentTurnResponseTurnStartPayload",
|
||||
"AppEvalTaskConfig",
|
||||
"AppendRowsRequest",
|
||||
"Attachment",
|
||||
"BatchChatCompletionRequest",
|
||||
"BatchChatCompletionResponse",
|
||||
|
|
@ -8629,6 +9183,7 @@
|
|||
"FinetuningAlgorithm",
|
||||
"FunctionCallToolDefinition",
|
||||
"GetAgentsSessionRequest",
|
||||
"GetSpanTreeRequest",
|
||||
"GraphMemoryBank",
|
||||
"GraphMemoryBankParams",
|
||||
"HealthInfo",
|
||||
|
|
@ -8663,8 +9218,12 @@
|
|||
"PreferenceOptimizeRequest",
|
||||
"ProviderInfo",
|
||||
"QLoraFinetuningConfig",
|
||||
"QueryCondition",
|
||||
"QueryConditionOp",
|
||||
"QueryDocumentsRequest",
|
||||
"QueryDocumentsResponse",
|
||||
"QuerySpansRequest",
|
||||
"QueryTracesRequest",
|
||||
"RLHFAlgorithm",
|
||||
"RegexParserScoringFnParams",
|
||||
"RegisterDatasetRequest",
|
||||
|
|
@ -8682,6 +9241,7 @@
|
|||
"SafetyViolation",
|
||||
"SamplingParams",
|
||||
"SamplingStrategy",
|
||||
"SaveSpansToDatasetRequest",
|
||||
"ScoreBatchRequest",
|
||||
"ScoreBatchResponse",
|
||||
"ScoreRequest",
|
||||
|
|
@ -8692,9 +9252,11 @@
|
|||
"Session",
|
||||
"Shield",
|
||||
"ShieldCallStep",
|
||||
"Span",
|
||||
"SpanEndPayload",
|
||||
"SpanStartPayload",
|
||||
"SpanStatus",
|
||||
"SpanWithChildren",
|
||||
"StopReason",
|
||||
"StructuredLogEvent",
|
||||
"SupervisedFineTuneRequest",
|
||||
|
|
@ -8716,6 +9278,7 @@
|
|||
"TrainingConfig",
|
||||
"Turn",
|
||||
"URL",
|
||||
"UnregisterDatasetRequest",
|
||||
"UnregisterMemoryBankRequest",
|
||||
"UnregisterModelRequest",
|
||||
"UnstructuredLogEvent",
|
||||
|
|
|
|||
|
|
@ -132,8 +132,6 @@ components:
|
|||
const: step_progress
|
||||
default: step_progress
|
||||
type: string
|
||||
model_response_text_delta:
|
||||
type: string
|
||||
step_id:
|
||||
type: string
|
||||
step_type:
|
||||
|
|
@ -143,10 +141,10 @@ components:
|
|||
- shield_call
|
||||
- memory_retrieval
|
||||
type: string
|
||||
text_delta:
|
||||
type: string
|
||||
tool_call_delta:
|
||||
$ref: '#/components/schemas/ToolCallDelta'
|
||||
tool_response_text_delta:
|
||||
type: string
|
||||
required:
|
||||
- event_type
|
||||
- step_type
|
||||
|
|
@ -242,6 +240,27 @@ components:
|
|||
- eval_candidate
|
||||
- scoring_params
|
||||
type: object
|
||||
AppendRowsRequest:
|
||||
additionalProperties: false
|
||||
properties:
|
||||
dataset_id:
|
||||
type: string
|
||||
rows:
|
||||
items:
|
||||
additionalProperties:
|
||||
oneOf:
|
||||
- type: 'null'
|
||||
- type: boolean
|
||||
- type: number
|
||||
- type: string
|
||||
- type: array
|
||||
- type: object
|
||||
type: object
|
||||
type: array
|
||||
required:
|
||||
- dataset_id
|
||||
- rows
|
||||
type: object
|
||||
Attachment:
|
||||
additionalProperties: false
|
||||
properties:
|
||||
|
|
@ -1059,6 +1078,14 @@ components:
|
|||
type: string
|
||||
type: array
|
||||
type: object
|
||||
GetSpanTreeRequest:
|
||||
additionalProperties: false
|
||||
properties:
|
||||
attributes_to_return:
|
||||
items:
|
||||
type: string
|
||||
type: array
|
||||
type: object
|
||||
GraphMemoryBank:
|
||||
additionalProperties: false
|
||||
properties:
|
||||
|
|
@ -1277,8 +1304,11 @@ components:
|
|||
- $ref: '#/components/schemas/UnstructuredLogEvent'
|
||||
- $ref: '#/components/schemas/MetricEvent'
|
||||
- $ref: '#/components/schemas/StructuredLogEvent'
|
||||
ttl_seconds:
|
||||
type: integer
|
||||
required:
|
||||
- event
|
||||
- ttl_seconds
|
||||
type: object
|
||||
LogSeverity:
|
||||
enum:
|
||||
|
|
@ -1825,6 +1855,33 @@ components:
|
|||
- rank
|
||||
- alpha
|
||||
type: object
|
||||
QueryCondition:
|
||||
additionalProperties: false
|
||||
properties:
|
||||
key:
|
||||
type: string
|
||||
op:
|
||||
$ref: '#/components/schemas/QueryConditionOp'
|
||||
value:
|
||||
oneOf:
|
||||
- type: 'null'
|
||||
- type: boolean
|
||||
- type: number
|
||||
- type: string
|
||||
- type: array
|
||||
- type: object
|
||||
required:
|
||||
- key
|
||||
- op
|
||||
- value
|
||||
type: object
|
||||
QueryConditionOp:
|
||||
enum:
|
||||
- eq
|
||||
- ne
|
||||
- gt
|
||||
- lt
|
||||
type: string
|
||||
QueryDocumentsRequest:
|
||||
additionalProperties: false
|
||||
properties:
|
||||
|
|
@ -1887,6 +1944,39 @@ components:
|
|||
- chunks
|
||||
- scores
|
||||
type: object
|
||||
QuerySpansRequest:
|
||||
additionalProperties: false
|
||||
properties:
|
||||
attribute_filters:
|
||||
items:
|
||||
$ref: '#/components/schemas/QueryCondition'
|
||||
type: array
|
||||
attributes_to_return:
|
||||
items:
|
||||
type: string
|
||||
type: array
|
||||
max_depth:
|
||||
type: integer
|
||||
required:
|
||||
- attribute_filters
|
||||
- attributes_to_return
|
||||
type: object
|
||||
QueryTracesRequest:
|
||||
additionalProperties: false
|
||||
properties:
|
||||
attribute_filters:
|
||||
items:
|
||||
$ref: '#/components/schemas/QueryCondition'
|
||||
type: array
|
||||
limit:
|
||||
type: integer
|
||||
offset:
|
||||
type: integer
|
||||
order_by:
|
||||
items:
|
||||
type: string
|
||||
type: array
|
||||
type: object
|
||||
RLHFAlgorithm:
|
||||
enum:
|
||||
- dpo
|
||||
|
|
@ -2392,6 +2482,26 @@ components:
|
|||
- top_p
|
||||
- top_k
|
||||
type: string
|
||||
SaveSpansToDatasetRequest:
|
||||
additionalProperties: false
|
||||
properties:
|
||||
attribute_filters:
|
||||
items:
|
||||
$ref: '#/components/schemas/QueryCondition'
|
||||
type: array
|
||||
attributes_to_save:
|
||||
items:
|
||||
type: string
|
||||
type: array
|
||||
dataset_id:
|
||||
type: string
|
||||
max_depth:
|
||||
type: integer
|
||||
required:
|
||||
- attribute_filters
|
||||
- attributes_to_save
|
||||
- dataset_id
|
||||
type: object
|
||||
ScoreBatchRequest:
|
||||
additionalProperties: false
|
||||
properties:
|
||||
|
|
@ -2731,6 +2841,39 @@ components:
|
|||
- step_id
|
||||
- step_type
|
||||
type: object
|
||||
Span:
|
||||
additionalProperties: false
|
||||
properties:
|
||||
attributes:
|
||||
additionalProperties:
|
||||
oneOf:
|
||||
- type: 'null'
|
||||
- type: boolean
|
||||
- type: number
|
||||
- type: string
|
||||
- type: array
|
||||
- type: object
|
||||
type: object
|
||||
end_time:
|
||||
format: date-time
|
||||
type: string
|
||||
name:
|
||||
type: string
|
||||
parent_span_id:
|
||||
type: string
|
||||
span_id:
|
||||
type: string
|
||||
start_time:
|
||||
format: date-time
|
||||
type: string
|
||||
trace_id:
|
||||
type: string
|
||||
required:
|
||||
- span_id
|
||||
- trace_id
|
||||
- name
|
||||
- start_time
|
||||
type: object
|
||||
SpanEndPayload:
|
||||
additionalProperties: false
|
||||
properties:
|
||||
|
|
@ -2764,6 +2907,46 @@ components:
|
|||
- ok
|
||||
- error
|
||||
type: string
|
||||
SpanWithChildren:
|
||||
additionalProperties: false
|
||||
properties:
|
||||
attributes:
|
||||
additionalProperties:
|
||||
oneOf:
|
||||
- type: 'null'
|
||||
- type: boolean
|
||||
- type: number
|
||||
- type: string
|
||||
- type: array
|
||||
- type: object
|
||||
type: object
|
||||
children:
|
||||
items:
|
||||
$ref: '#/components/schemas/SpanWithChildren'
|
||||
type: array
|
||||
end_time:
|
||||
format: date-time
|
||||
type: string
|
||||
name:
|
||||
type: string
|
||||
parent_span_id:
|
||||
type: string
|
||||
span_id:
|
||||
type: string
|
||||
start_time:
|
||||
format: date-time
|
||||
type: string
|
||||
status:
|
||||
$ref: '#/components/schemas/SpanStatus'
|
||||
trace_id:
|
||||
type: string
|
||||
required:
|
||||
- span_id
|
||||
- trace_id
|
||||
- name
|
||||
- start_time
|
||||
- children
|
||||
type: object
|
||||
StopReason:
|
||||
enum:
|
||||
- end_of_turn
|
||||
|
|
@ -3237,6 +3420,14 @@ components:
|
|||
format: uri
|
||||
pattern: ^(https?://|file://|data:)
|
||||
type: string
|
||||
UnregisterDatasetRequest:
|
||||
additionalProperties: false
|
||||
properties:
|
||||
dataset_id:
|
||||
type: string
|
||||
required:
|
||||
- dataset_id
|
||||
type: object
|
||||
UnregisterMemoryBankRequest:
|
||||
additionalProperties: false
|
||||
properties:
|
||||
|
|
@ -3400,7 +3591,7 @@ components:
|
|||
info:
|
||||
description: "This is the specification of the Llama Stack that provides\n \
|
||||
\ a set of endpoints and their corresponding interfaces that are tailored\
|
||||
\ to\n best leverage Llama Models. Generated at 2024-11-22 17:23:55.034164"
|
||||
\ to\n best leverage Llama Models."
|
||||
title: Llama Stack Specification
|
||||
version: alpha
|
||||
jsonSchemaDialect: https://json-schema.org/draft/2020-12/schema
|
||||
|
|
@ -3684,6 +3875,27 @@ paths:
|
|||
description: OK
|
||||
tags:
|
||||
- BatchInference (Coming Soon)
|
||||
/alpha/datasetio/append-rows:
|
||||
post:
|
||||
parameters:
|
||||
- description: JSON-encoded provider data which will be made available to the
|
||||
adapter servicing the API
|
||||
in: header
|
||||
name: X-LlamaStack-ProviderData
|
||||
required: false
|
||||
schema:
|
||||
type: string
|
||||
requestBody:
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: '#/components/schemas/AppendRowsRequest'
|
||||
required: true
|
||||
responses:
|
||||
'200':
|
||||
description: OK
|
||||
tags:
|
||||
- DatasetIO
|
||||
/alpha/datasetio/get-rows-paginated:
|
||||
get:
|
||||
parameters:
|
||||
|
|
@ -3789,6 +4001,27 @@ paths:
|
|||
description: OK
|
||||
tags:
|
||||
- Datasets
|
||||
/alpha/datasets/unregister:
|
||||
post:
|
||||
parameters:
|
||||
- description: JSON-encoded provider data which will be made available to the
|
||||
adapter servicing the API
|
||||
in: header
|
||||
name: X-LlamaStack-ProviderData
|
||||
required: false
|
||||
schema:
|
||||
type: string
|
||||
requestBody:
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: '#/components/schemas/UnregisterDatasetRequest'
|
||||
required: true
|
||||
responses:
|
||||
'200':
|
||||
description: OK
|
||||
tags:
|
||||
- Datasets
|
||||
/alpha/eval-tasks/get:
|
||||
get:
|
||||
parameters:
|
||||
|
|
@ -4756,14 +4989,19 @@ paths:
|
|||
description: OK
|
||||
tags:
|
||||
- SyntheticDataGeneration (Coming Soon)
|
||||
/alpha/telemetry/get-trace:
|
||||
get:
|
||||
/alpha/telemetry/get-span-tree:
|
||||
post:
|
||||
parameters:
|
||||
- in: query
|
||||
name: trace_id
|
||||
name: span_id
|
||||
required: true
|
||||
schema:
|
||||
type: string
|
||||
- in: query
|
||||
name: max_depth
|
||||
required: false
|
||||
schema:
|
||||
type: integer
|
||||
- description: JSON-encoded provider data which will be made available to the
|
||||
adapter servicing the API
|
||||
in: header
|
||||
|
|
@ -4771,12 +5009,18 @@ paths:
|
|||
required: false
|
||||
schema:
|
||||
type: string
|
||||
requestBody:
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: '#/components/schemas/GetSpanTreeRequest'
|
||||
required: true
|
||||
responses:
|
||||
'200':
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: '#/components/schemas/Trace'
|
||||
$ref: '#/components/schemas/SpanWithChildren'
|
||||
description: OK
|
||||
tags:
|
||||
- Telemetry
|
||||
|
|
@ -4801,6 +5045,77 @@ paths:
|
|||
description: OK
|
||||
tags:
|
||||
- Telemetry
|
||||
/alpha/telemetry/query-spans:
|
||||
post:
|
||||
parameters:
|
||||
- description: JSON-encoded provider data which will be made available to the
|
||||
adapter servicing the API
|
||||
in: header
|
||||
name: X-LlamaStack-ProviderData
|
||||
required: false
|
||||
schema:
|
||||
type: string
|
||||
requestBody:
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: '#/components/schemas/QuerySpansRequest'
|
||||
required: true
|
||||
responses:
|
||||
'200':
|
||||
content:
|
||||
application/jsonl:
|
||||
schema:
|
||||
$ref: '#/components/schemas/Span'
|
||||
description: OK
|
||||
tags:
|
||||
- Telemetry
|
||||
/alpha/telemetry/query-traces:
|
||||
post:
|
||||
parameters:
|
||||
- description: JSON-encoded provider data which will be made available to the
|
||||
adapter servicing the API
|
||||
in: header
|
||||
name: X-LlamaStack-ProviderData
|
||||
required: false
|
||||
schema:
|
||||
type: string
|
||||
requestBody:
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: '#/components/schemas/QueryTracesRequest'
|
||||
required: true
|
||||
responses:
|
||||
'200':
|
||||
content:
|
||||
application/jsonl:
|
||||
schema:
|
||||
$ref: '#/components/schemas/Trace'
|
||||
description: OK
|
||||
tags:
|
||||
- Telemetry
|
||||
/alpha/telemetry/save-spans-to-dataset:
|
||||
post:
|
||||
parameters:
|
||||
- description: JSON-encoded provider data which will be made available to the
|
||||
adapter servicing the API
|
||||
in: header
|
||||
name: X-LlamaStack-ProviderData
|
||||
required: false
|
||||
schema:
|
||||
type: string
|
||||
requestBody:
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: '#/components/schemas/SaveSpansToDatasetRequest'
|
||||
required: true
|
||||
responses:
|
||||
'200':
|
||||
description: OK
|
||||
tags:
|
||||
- Telemetry
|
||||
security:
|
||||
- Default: []
|
||||
servers:
|
||||
|
|
@ -4849,6 +5164,9 @@ tags:
|
|||
- description: <SchemaDefinition schemaRef="#/components/schemas/AppEvalTaskConfig"
|
||||
/>
|
||||
name: AppEvalTaskConfig
|
||||
- description: <SchemaDefinition schemaRef="#/components/schemas/AppendRowsRequest"
|
||||
/>
|
||||
name: AppendRowsRequest
|
||||
- description: <SchemaDefinition schemaRef="#/components/schemas/Attachment" />
|
||||
name: Attachment
|
||||
- description: <SchemaDefinition schemaRef="#/components/schemas/BatchChatCompletionRequest"
|
||||
|
|
@ -4970,6 +5288,9 @@ tags:
|
|||
- description: <SchemaDefinition schemaRef="#/components/schemas/GetAgentsSessionRequest"
|
||||
/>
|
||||
name: GetAgentsSessionRequest
|
||||
- description: <SchemaDefinition schemaRef="#/components/schemas/GetSpanTreeRequest"
|
||||
/>
|
||||
name: GetSpanTreeRequest
|
||||
- description: <SchemaDefinition schemaRef="#/components/schemas/GraphMemoryBank"
|
||||
/>
|
||||
name: GraphMemoryBank
|
||||
|
|
@ -5076,12 +5397,23 @@ tags:
|
|||
- description: <SchemaDefinition schemaRef="#/components/schemas/QLoraFinetuningConfig"
|
||||
/>
|
||||
name: QLoraFinetuningConfig
|
||||
- description: <SchemaDefinition schemaRef="#/components/schemas/QueryCondition" />
|
||||
name: QueryCondition
|
||||
- description: <SchemaDefinition schemaRef="#/components/schemas/QueryConditionOp"
|
||||
/>
|
||||
name: QueryConditionOp
|
||||
- description: <SchemaDefinition schemaRef="#/components/schemas/QueryDocumentsRequest"
|
||||
/>
|
||||
name: QueryDocumentsRequest
|
||||
- description: <SchemaDefinition schemaRef="#/components/schemas/QueryDocumentsResponse"
|
||||
/>
|
||||
name: QueryDocumentsResponse
|
||||
- description: <SchemaDefinition schemaRef="#/components/schemas/QuerySpansRequest"
|
||||
/>
|
||||
name: QuerySpansRequest
|
||||
- description: <SchemaDefinition schemaRef="#/components/schemas/QueryTracesRequest"
|
||||
/>
|
||||
name: QueryTracesRequest
|
||||
- description: <SchemaDefinition schemaRef="#/components/schemas/RLHFAlgorithm" />
|
||||
name: RLHFAlgorithm
|
||||
- description: <SchemaDefinition schemaRef="#/components/schemas/RegexParserScoringFnParams"
|
||||
|
|
@ -5129,6 +5461,9 @@ tags:
|
|||
- description: <SchemaDefinition schemaRef="#/components/schemas/SamplingStrategy"
|
||||
/>
|
||||
name: SamplingStrategy
|
||||
- description: <SchemaDefinition schemaRef="#/components/schemas/SaveSpansToDatasetRequest"
|
||||
/>
|
||||
name: SaveSpansToDatasetRequest
|
||||
- description: <SchemaDefinition schemaRef="#/components/schemas/ScoreBatchRequest"
|
||||
/>
|
||||
name: ScoreBatchRequest
|
||||
|
|
@ -5161,6 +5496,8 @@ tags:
|
|||
- description: <SchemaDefinition schemaRef="#/components/schemas/ShieldCallStep" />
|
||||
name: ShieldCallStep
|
||||
- name: Shields
|
||||
- description: <SchemaDefinition schemaRef="#/components/schemas/Span" />
|
||||
name: Span
|
||||
- description: <SchemaDefinition schemaRef="#/components/schemas/SpanEndPayload" />
|
||||
name: SpanEndPayload
|
||||
- description: <SchemaDefinition schemaRef="#/components/schemas/SpanStartPayload"
|
||||
|
|
@ -5168,6 +5505,9 @@ tags:
|
|||
name: SpanStartPayload
|
||||
- description: <SchemaDefinition schemaRef="#/components/schemas/SpanStatus" />
|
||||
name: SpanStatus
|
||||
- description: <SchemaDefinition schemaRef="#/components/schemas/SpanWithChildren"
|
||||
/>
|
||||
name: SpanWithChildren
|
||||
- description: <SchemaDefinition schemaRef="#/components/schemas/StopReason" />
|
||||
name: StopReason
|
||||
- description: <SchemaDefinition schemaRef="#/components/schemas/StructuredLogEvent"
|
||||
|
|
@ -5236,6 +5576,9 @@ tags:
|
|||
name: Turn
|
||||
- description: <SchemaDefinition schemaRef="#/components/schemas/URL" />
|
||||
name: URL
|
||||
- description: <SchemaDefinition schemaRef="#/components/schemas/UnregisterDatasetRequest"
|
||||
/>
|
||||
name: UnregisterDatasetRequest
|
||||
- description: <SchemaDefinition schemaRef="#/components/schemas/UnregisterMemoryBankRequest"
|
||||
/>
|
||||
name: UnregisterMemoryBankRequest
|
||||
|
|
@ -5294,6 +5637,7 @@ x-tagGroups:
|
|||
- AgentTurnResponseTurnCompletePayload
|
||||
- AgentTurnResponseTurnStartPayload
|
||||
- AppEvalTaskConfig
|
||||
- AppendRowsRequest
|
||||
- Attachment
|
||||
- BatchChatCompletionRequest
|
||||
- BatchChatCompletionResponse
|
||||
|
|
@ -5329,6 +5673,7 @@ x-tagGroups:
|
|||
- FinetuningAlgorithm
|
||||
- FunctionCallToolDefinition
|
||||
- GetAgentsSessionRequest
|
||||
- GetSpanTreeRequest
|
||||
- GraphMemoryBank
|
||||
- GraphMemoryBankParams
|
||||
- HealthInfo
|
||||
|
|
@ -5363,8 +5708,12 @@ x-tagGroups:
|
|||
- PreferenceOptimizeRequest
|
||||
- ProviderInfo
|
||||
- QLoraFinetuningConfig
|
||||
- QueryCondition
|
||||
- QueryConditionOp
|
||||
- QueryDocumentsRequest
|
||||
- QueryDocumentsResponse
|
||||
- QuerySpansRequest
|
||||
- QueryTracesRequest
|
||||
- RLHFAlgorithm
|
||||
- RegexParserScoringFnParams
|
||||
- RegisterDatasetRequest
|
||||
|
|
@ -5382,6 +5731,7 @@ x-tagGroups:
|
|||
- SafetyViolation
|
||||
- SamplingParams
|
||||
- SamplingStrategy
|
||||
- SaveSpansToDatasetRequest
|
||||
- ScoreBatchRequest
|
||||
- ScoreBatchResponse
|
||||
- ScoreRequest
|
||||
|
|
@ -5392,9 +5742,11 @@ x-tagGroups:
|
|||
- Session
|
||||
- Shield
|
||||
- ShieldCallStep
|
||||
- Span
|
||||
- SpanEndPayload
|
||||
- SpanStartPayload
|
||||
- SpanStatus
|
||||
- SpanWithChildren
|
||||
- StopReason
|
||||
- StructuredLogEvent
|
||||
- SupervisedFineTuneRequest
|
||||
|
|
@ -5416,6 +5768,7 @@ x-tagGroups:
|
|||
- TrainingConfig
|
||||
- Turn
|
||||
- URL
|
||||
- UnregisterDatasetRequest
|
||||
- UnregisterMemoryBankRequest
|
||||
- UnregisterModelRequest
|
||||
- UnstructuredLogEvent
|
||||
|
|
|
|||
|
|
@ -1,15 +1,418 @@
|
|||
# Building Applications
|
||||
# Building AI Applications
|
||||
|
||||
```{admonition} Work in Progress
|
||||
:class: warning
|
||||
Llama Stack provides all the building blocks needed to create sophisticated AI applications. This guide will walk you through how to use these components effectively.
|
||||
|
||||
## What can you do with the Stack?
|
||||
## Basic Inference
|
||||
|
||||
- Agents
|
||||
- what is a turn? session?
|
||||
- inference
|
||||
- memory / RAG; pre-ingesting content or attaching content in a turn
|
||||
- how does tool calling work
|
||||
- can you do evaluation?
|
||||
The foundation of any AI application is the ability to interact with LLMs. Llama Stack provides a simple interface for both completion and chat-based inference:
|
||||
|
||||
```python
|
||||
from llama_stack_client import LlamaStackClient
|
||||
|
||||
client = LlamaStackClient(base_url="http://localhost:5001")
|
||||
|
||||
# List available models
|
||||
models = client.models.list()
|
||||
|
||||
# Simple chat completion
|
||||
response = client.inference.chat_completion(
|
||||
model_id="Llama3.2-3B-Instruct",
|
||||
messages=[
|
||||
{"role": "system", "content": "You are a helpful assistant."},
|
||||
{"role": "user", "content": "Write a haiku about coding"}
|
||||
]
|
||||
)
|
||||
print(response.completion_message.content)
|
||||
```
|
||||
|
||||
## Adding Memory & RAG
|
||||
|
||||
Memory enables your applications to reference and recall information from previous interactions or external documents. Llama Stack's memory system is built around the concept of Memory Banks:
|
||||
|
||||
1. **Vector Memory Banks**: For semantic search and retrieval
|
||||
2. **Key-Value Memory Banks**: For structured data storage
|
||||
3. **Keyword Memory Banks**: For basic text search
|
||||
4. **Graph Memory Banks**: For relationship-based retrieval
|
||||
|
||||
Here's how to set up a vector memory bank for RAG:
|
||||
|
||||
```python
|
||||
# Register a memory bank
|
||||
bank_id = "my_documents"
|
||||
response = client.memory_banks.register(
|
||||
memory_bank_id=bank_id,
|
||||
params={
|
||||
"memory_bank_type": "vector",
|
||||
"embedding_model": "all-MiniLM-L6-v2",
|
||||
"chunk_size_in_tokens": 512
|
||||
}
|
||||
)
|
||||
|
||||
# Insert documents
|
||||
documents = [
|
||||
{
|
||||
"document_id": "doc1",
|
||||
"content": "Your document text here",
|
||||
"mime_type": "text/plain"
|
||||
}
|
||||
]
|
||||
client.memory.insert(bank_id, documents)
|
||||
|
||||
# Query documents
|
||||
results = client.memory.query(
|
||||
bank_id=bank_id,
|
||||
query="What do you know about...",
|
||||
)
|
||||
```
|
||||
|
||||
## Implementing Safety Guardrails
|
||||
|
||||
Safety is a critical component of any AI application. Llama Stack provides a Shield system that can be applied at multiple touchpoints:
|
||||
|
||||
```python
|
||||
# Register a safety shield
|
||||
shield_id = "content_safety"
|
||||
client.shields.register(
|
||||
shield_id=shield_id,
|
||||
provider_shield_id="llama-guard-basic"
|
||||
)
|
||||
|
||||
# Run content through shield
|
||||
response = client.safety.run_shield(
|
||||
shield_id=shield_id,
|
||||
messages=[{"role": "user", "content": "User message here"}]
|
||||
)
|
||||
|
||||
if response.violation:
|
||||
print(f"Safety violation detected: {response.violation.user_message}")
|
||||
```
|
||||
|
||||
## Building Agents
|
||||
|
||||
Agents are the heart of complex AI applications. They combine inference, memory, safety, and tool usage into coherent workflows. At its core, an agent follows a sophisticated execution loop that enables multi-step reasoning, tool usage, and safety checks.
|
||||
|
||||
### The Agent Execution Loop
|
||||
|
||||
Each agent turn follows these key steps:
|
||||
|
||||
1. **Initial Safety Check**: The user's input is first screened through configured safety shields
|
||||
|
||||
2. **Context Retrieval**:
|
||||
- If RAG is enabled, the agent queries relevant documents from memory banks
|
||||
- For new documents, they are first inserted into the memory bank
|
||||
- Retrieved context is used to augment the user's prompt
|
||||
|
||||
3. **Inference Loop**: The agent enters its main execution loop:
|
||||
- The LLM receives the augmented prompt (with context and/or previous tool outputs)
|
||||
- The LLM generates a response, potentially with tool calls
|
||||
- If tool calls are present:
|
||||
- Tool inputs are safety-checked
|
||||
- Tools are executed (e.g., web search, code execution)
|
||||
- Tool responses are fed back to the LLM for synthesis
|
||||
- The loop continues until:
|
||||
- The LLM provides a final response without tool calls
|
||||
- Maximum iterations are reached
|
||||
- Token limit is exceeded
|
||||
|
||||
4. **Final Safety Check**: The agent's final response is screened through safety shields
|
||||
|
||||
```{mermaid}
|
||||
sequenceDiagram
|
||||
participant U as User
|
||||
participant E as Executor
|
||||
participant M as Memory Bank
|
||||
participant L as LLM
|
||||
participant T as Tools
|
||||
participant S as Safety Shield
|
||||
|
||||
Note over U,S: Agent Turn Start
|
||||
U->>S: 1. Submit Prompt
|
||||
activate S
|
||||
S->>E: Input Safety Check
|
||||
deactivate S
|
||||
|
||||
E->>M: 2.1 Query Context
|
||||
M-->>E: 2.2 Retrieved Documents
|
||||
|
||||
loop Inference Loop
|
||||
E->>L: 3.1 Augment with Context
|
||||
L-->>E: 3.2 Response (with/without tool calls)
|
||||
|
||||
alt Has Tool Calls
|
||||
E->>S: Check Tool Input
|
||||
S->>T: 4.1 Execute Tool
|
||||
T-->>E: 4.2 Tool Response
|
||||
E->>L: 5.1 Tool Response
|
||||
L-->>E: 5.2 Synthesized Response
|
||||
end
|
||||
|
||||
opt Stop Conditions
|
||||
Note over E: Break if:
|
||||
Note over E: - No tool calls
|
||||
Note over E: - Max iterations reached
|
||||
Note over E: - Token limit exceeded
|
||||
end
|
||||
end
|
||||
|
||||
E->>S: Output Safety Check
|
||||
S->>U: 6. Final Response
|
||||
```
|
||||
|
||||
Each step in this process can be monitored and controlled through configurations. Here's an example that demonstrates monitoring the agent's execution:
|
||||
|
||||
```python
|
||||
from llama_stack_client.lib.agents.event_logger import EventLogger
|
||||
|
||||
agent_config = AgentConfig(
|
||||
model="Llama3.2-3B-Instruct",
|
||||
instructions="You are a helpful assistant",
|
||||
# Enable both RAG and tool usage
|
||||
tools=[
|
||||
{
|
||||
"type": "memory",
|
||||
"memory_bank_configs": [{
|
||||
"type": "vector",
|
||||
"bank_id": "my_docs"
|
||||
}],
|
||||
"max_tokens_in_context": 4096
|
||||
},
|
||||
{
|
||||
"type": "code_interpreter",
|
||||
"enable_inline_code_execution": True
|
||||
}
|
||||
],
|
||||
# Configure safety
|
||||
input_shields=["content_safety"],
|
||||
output_shields=["content_safety"],
|
||||
# Control the inference loop
|
||||
max_infer_iters=5,
|
||||
sampling_params={
|
||||
"temperature": 0.7,
|
||||
"max_tokens": 2048
|
||||
}
|
||||
)
|
||||
|
||||
agent = Agent(client, agent_config)
|
||||
session_id = agent.create_session("monitored_session")
|
||||
|
||||
# Stream the agent's execution steps
|
||||
response = agent.create_turn(
|
||||
messages=[{"role": "user", "content": "Analyze this code and run it"}],
|
||||
attachments=[{
|
||||
"content": "https://raw.githubusercontent.com/example/code.py",
|
||||
"mime_type": "text/plain"
|
||||
}],
|
||||
session_id=session_id
|
||||
)
|
||||
|
||||
# Monitor each step of execution
|
||||
for log in EventLogger().log(response):
|
||||
if log.event.step_type == "memory_retrieval":
|
||||
print("Retrieved context:", log.event.retrieved_context)
|
||||
elif log.event.step_type == "inference":
|
||||
print("LLM output:", log.event.model_response)
|
||||
elif log.event.step_type == "tool_execution":
|
||||
print("Tool call:", log.event.tool_call)
|
||||
print("Tool response:", log.event.tool_response)
|
||||
elif log.event.step_type == "shield_call":
|
||||
if log.event.violation:
|
||||
print("Safety violation:", log.event.violation)
|
||||
```
|
||||
|
||||
The example above shows how each step of an agent's execution can be monitored. To build agents, Llama Stack provides a high-level agent framework:
|
||||
|
||||
```python
|
||||
from llama_stack_client.lib.agents.agent import Agent
|
||||
from llama_stack_client.types.agent_create_params import AgentConfig
|
||||
|
||||
# Configure an agent
|
||||
agent_config = AgentConfig(
|
||||
model="Llama3.2-3B-Instruct",
|
||||
instructions="You are a helpful assistant",
|
||||
tools=[
|
||||
{
|
||||
"type": "memory",
|
||||
"memory_bank_configs": [],
|
||||
"query_generator_config": {
|
||||
"type": "default",
|
||||
"sep": " "
|
||||
}
|
||||
}
|
||||
],
|
||||
input_shields=["content_safety"],
|
||||
output_shields=["content_safety"],
|
||||
enable_session_persistence=True
|
||||
)
|
||||
|
||||
# Create an agent
|
||||
agent = Agent(client, agent_config)
|
||||
session_id = agent.create_session("my_session")
|
||||
|
||||
# Run agent turns
|
||||
response = agent.create_turn(
|
||||
messages=[{"role": "user", "content": "Your question here"}],
|
||||
session_id=session_id
|
||||
)
|
||||
```
|
||||
|
||||
### Adding Tools to Agents
|
||||
|
||||
Agents can be enhanced with various tools:
|
||||
|
||||
1. **Search**: Web search capabilities through providers like Brave
|
||||
2. **Code Interpreter**: Execute code snippets
|
||||
3. **RAG**: Memory and document retrieval
|
||||
4. **Function Calling**: Custom function execution
|
||||
5. **WolframAlpha**: Mathematical computations
|
||||
6. **Photogen**: Image generation
|
||||
|
||||
Example of configuring an agent with tools:
|
||||
|
||||
```python
|
||||
agent_config = AgentConfig(
|
||||
model="Llama3.2-3B-Instruct",
|
||||
tools=[
|
||||
{
|
||||
"type": "brave_search",
|
||||
"api_key": "YOUR_API_KEY",
|
||||
"engine": "brave"
|
||||
},
|
||||
{
|
||||
"type": "code_interpreter",
|
||||
"enable_inline_code_execution": True
|
||||
}
|
||||
],
|
||||
tool_choice="auto",
|
||||
tool_prompt_format="json"
|
||||
)
|
||||
```
|
||||
|
||||
## Building RAG-Enhanced Agents
|
||||
|
||||
One of the most powerful patterns is combining agents with RAG capabilities. Here's a complete example:
|
||||
|
||||
```python
|
||||
from llama_stack_client.types import Attachment
|
||||
|
||||
# Create attachments from documents
|
||||
attachments = [
|
||||
Attachment(
|
||||
content="https://raw.githubusercontent.com/example/doc.rst",
|
||||
mime_type="text/plain"
|
||||
)
|
||||
]
|
||||
|
||||
# Configure agent with memory
|
||||
agent_config = AgentConfig(
|
||||
model="Llama3.2-3B-Instruct",
|
||||
instructions="You are a helpful assistant",
|
||||
tools=[{
|
||||
"type": "memory",
|
||||
"memory_bank_configs": [],
|
||||
"query_generator_config": {"type": "default", "sep": " "},
|
||||
"max_tokens_in_context": 4096,
|
||||
"max_chunks": 10
|
||||
}],
|
||||
enable_session_persistence=True
|
||||
)
|
||||
|
||||
agent = Agent(client, agent_config)
|
||||
session_id = agent.create_session("rag_session")
|
||||
|
||||
# Initial document ingestion
|
||||
response = agent.create_turn(
|
||||
messages=[{
|
||||
"role": "user",
|
||||
"content": "I am providing some documents for reference."
|
||||
}],
|
||||
attachments=attachments,
|
||||
session_id=session_id
|
||||
)
|
||||
|
||||
# Query with RAG
|
||||
response = agent.create_turn(
|
||||
messages=[{
|
||||
"role": "user",
|
||||
"content": "What are the key topics in the documents?"
|
||||
}],
|
||||
session_id=session_id
|
||||
)
|
||||
```
|
||||
|
||||
## Testing & Evaluation
|
||||
|
||||
Llama Stack provides built-in tools for evaluating your applications:
|
||||
|
||||
1. **Benchmarking**: Test against standard datasets
|
||||
2. **Application Evaluation**: Score your application's outputs
|
||||
3. **Custom Metrics**: Define your own evaluation criteria
|
||||
|
||||
Here's how to set up basic evaluation:
|
||||
|
||||
```python
|
||||
# Create an evaluation task
|
||||
response = client.eval_tasks.register(
|
||||
eval_task_id="my_eval",
|
||||
dataset_id="my_dataset",
|
||||
scoring_functions=["accuracy", "relevance"]
|
||||
)
|
||||
|
||||
# Run evaluation
|
||||
job = client.eval.run_eval(
|
||||
task_id="my_eval",
|
||||
task_config={
|
||||
"type": "app",
|
||||
"eval_candidate": {
|
||||
"type": "agent",
|
||||
"config": agent_config
|
||||
}
|
||||
}
|
||||
)
|
||||
|
||||
# Get results
|
||||
result = client.eval.job_result(
|
||||
task_id="my_eval",
|
||||
job_id=job.job_id
|
||||
)
|
||||
```
|
||||
|
||||
## Debugging & Monitoring
|
||||
|
||||
Llama Stack includes comprehensive telemetry for debugging and monitoring your applications:
|
||||
|
||||
1. **Tracing**: Track request flows across components
|
||||
2. **Metrics**: Measure performance and usage
|
||||
3. **Logging**: Debug issues and track behavior
|
||||
|
||||
The telemetry system supports multiple output formats:
|
||||
|
||||
- OpenTelemetry for visualization in tools like Jaeger
|
||||
- SQLite for local storage and querying
|
||||
- Console output for development
|
||||
|
||||
Example of querying traces:
|
||||
|
||||
```python
|
||||
# Query traces for a session
|
||||
traces = client.telemetry.query_traces(
|
||||
attribute_filters=[{
|
||||
"key": "session_id",
|
||||
"op": "eq",
|
||||
"value": session_id
|
||||
}]
|
||||
)
|
||||
|
||||
# Get detailed span information
|
||||
span_tree = client.telemetry.get_span_tree(
|
||||
span_id=traces[0].root_span_id
|
||||
)
|
||||
```
|
||||
|
||||
For details on how to use the telemetry system to debug your applications, export traces to a dataset, and run evaluations, see the [Telemetry](telemetry) section.
|
||||
|
||||
```{toctree}
|
||||
:hidden:
|
||||
:maxdepth: 3
|
||||
|
||||
telemetry
|
||||
```
|
||||
|
|
|
|||
242
docs/source/building_applications/telemetry.md
Normal file
|
|
@ -0,0 +1,242 @@
|
|||
# Telemetry
|
||||
```{note}
|
||||
The telemetry system is currently experimental and subject to change. We welcome feedback and contributions to help improve it.
|
||||
```
|
||||
|
||||
|
||||
|
||||
The Llama Stack telemetry system provides comprehensive tracing, metrics, and logging capabilities. It supports multiple sink types including OpenTelemetry, SQLite, and Console output.
|
||||
|
||||
## Key Concepts
|
||||
|
||||
### Events
|
||||
The telemetry system supports three main types of events:
|
||||
|
||||
- **Unstructured Log Events**: Free-form log messages with severity levels
|
||||
```python
|
||||
unstructured_log_event = UnstructuredLogEvent(
|
||||
message="This is a log message",
|
||||
severity=LogSeverity.INFO
|
||||
)
|
||||
```
|
||||
- **Metric Events**: Numerical measurements with units
|
||||
```python
|
||||
metric_event = MetricEvent(
|
||||
metric="my_metric",
|
||||
value=10,
|
||||
unit="count"
|
||||
)
|
||||
```
|
||||
- **Structured Log Events**: System events like span start/end. Extensible to add more structured log types.
|
||||
```python
|
||||
structured_log_event = SpanStartPayload(
|
||||
name="my_span",
|
||||
parent_span_id="parent_span_id"
|
||||
)
|
||||
```
|
||||
|
||||
### Spans and Traces
|
||||
- **Spans**: Represent operations with timing and hierarchical relationships
|
||||
- **Traces**: Collection of related spans forming a complete request flow
|
||||
|
||||
### Sinks
|
||||
- **OpenTelemetry**: Send events to an OpenTelemetry Collector. This is useful for visualizing traces in a tool like Jaeger.
|
||||
- **SQLite**: Store events in a local SQLite database. This is needed if you want to query the events later through the Llama Stack API.
|
||||
- **Console**: Print events to the console.
|
||||
|
||||
## APIs
|
||||
|
||||
The telemetry API is designed to be flexible for different user flows like debugging/visualization in UI, monitoring, and saving traces to datasets.
|
||||
The telemetry system exposes the following HTTP endpoints:
|
||||
|
||||
### Log Event
|
||||
```http
|
||||
POST /telemetry/log-event
|
||||
```
|
||||
Logs a telemetry event (unstructured log, metric, or structured log) with optional TTL.
|
||||
|
||||
### Query Traces
|
||||
```http
|
||||
POST /telemetry/query-traces
|
||||
```
|
||||
Retrieves traces based on filters with pagination support. Parameters:
|
||||
- `attribute_filters`: List of conditions to filter traces
|
||||
- `limit`: Maximum number of traces to return (default: 100)
|
||||
- `offset`: Number of traces to skip (default: 0)
|
||||
- `order_by`: List of fields to sort by
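
A complete curl example for this endpoint is shown in [Querying Traces Stored in SQLite](#querying-traces-stored-in-sqlite) below.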
|
||||
|
||||
### Get Span Tree
|
||||
```http
|
||||
POST /telemetry/get-span-tree
|
||||
```
|
||||
Retrieves a hierarchical view of spans starting from a specific span. Parameters:
|
||||
- `span_id`: ID of the root span to retrieve
|
||||
- `attributes_to_return`: Optional list of specific attributes to include
|
||||
- `max_depth`: Optional maximum depth of the span tree to return
|
||||
|
||||
### Query Spans
|
||||
```http
|
||||
POST /telemetry/query-spans
|
||||
```
|
||||
Retrieves spans matching specified filters and returns selected attributes. Parameters:
|
||||
- `attribute_filters`: List of conditions to filter traces
|
||||
- `attributes_to_return`: List of specific attributes to include in results
|
||||
- `max_depth`: Optional maximum depth of spans to traverse (default: no limit)
|
||||
|
||||
Returns a flattened list of spans with requested attributes.
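
As a sketch, a query-spans request follows the same curl pattern as the other telemetry endpoints; the `session_id` filter value and the attribute names below are illustrative placeholders:

```bash
curl -X POST 'http://localhost:5000/alpha/telemetry/query-spans' \
-H 'Content-Type: application/json' \
-d '{
  "attribute_filters": [
    {
      "key": "session_id",
      "op": "eq",
      "value": "dd667b87-ca4b-4d30-9265-5a0de318fc65"
    }
  ],
  "attributes_to_return": ["input", "output"],
  "max_depth": 10
}'
```

As with `/alpha/telemetry/query-traces`, the response is returned as JSONL, one `Span` object per line.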
|
||||
|
||||
### Save Spans to Dataset
|
||||
This is useful for saving traces to a dataset for running evaluations. For example, you can save the input/output of each span that is part of an agent session/turn to a dataset and then run an eval task on it. See example in [Example: Save Spans to Dataset](#example-save-spans-to-dataset).
|
||||
```http
|
||||
POST /telemetry/save-spans-to-dataset
|
||||
```
|
||||
Queries spans and saves their attributes to a dataset. Parameters:
|
||||
- `attribute_filters`: List of conditions to filter traces
|
||||
- `attributes_to_save`: List of span attributes to save to the dataset
|
||||
- `dataset_id`: ID of the dataset to save to
|
||||
- `max_depth`: Optional maximum depth of spans to traverse (default: no limit)
|
||||
|
||||
## Providers
|
||||
|
||||
### Meta-Reference Provider
|
||||
Currently, only the meta-reference provider is implemented. It can be configured to send events to three sink types:
|
||||
1) OpenTelemetry Collector
|
||||
2) SQLite
|
||||
3) Console
|
||||
|
||||
## Configuration
|
||||
|
||||
Here's an example that sends telemetry signals to all three sink types. Your configuration might use only one.
|
||||
```yaml
|
||||
telemetry:
|
||||
- provider_id: meta-reference
|
||||
provider_type: inline::meta-reference
|
||||
config:
|
||||
sinks: ['console', 'sqlite', 'otel']
|
||||
otel_endpoint: "http://localhost:4318/v1/traces"
|
||||
sqlite_db_path: "/path/to/telemetry.db"
|
||||
```
|
||||
|
||||
## Using Jaeger to Visualize Traces
|
||||
|
||||
The `otel` sink works with any service compatible with the OpenTelemetry collector. Let's use Jaeger to visualize this data.
|
||||
|
||||
Start a Jaeger instance with the OTLP HTTP endpoint at 4318 and the Jaeger UI at 16686 using the following command:
|
||||
|
||||
```bash
|
||||
$ docker run --rm --name jaeger \
|
||||
-p 16686:16686 -p 4318:4318 \
|
||||
jaegertracing/jaeger:2.1.0
|
||||
```
|
||||
|
||||
Once the Jaeger instance is running, you can visualize traces by navigating to http://localhost:16686/.
|
||||
|
||||
## Querying Traces Stored in SQLite
|
||||
|
||||
The `sqlite` sink allows you to query traces without an external system. Here are some example queries:
|
||||
|
||||
Querying traces for an agent session
|
||||
The client SDK has not yet been updated to support the new telemetry API; it will be updated soon. In the meantime, you can query traces manually with the following curl command:
|
||||
|
||||
``` bash
|
||||
curl -X POST 'http://localhost:5000/alpha/telemetry/query-traces' \
|
||||
-H 'Content-Type: application/json' \
|
||||
-d '{
    "attribute_filters": [
      {
        "key": "session_id",
        "op": "eq",
        "value": "dd667b87-ca4b-4d30-9265-5a0de318fc65"
      }
    ],
    "limit": 100,
    "offset": 0,
    "order_by": ["start_time"]
  }'

[
  {
    "trace_id": "6902f54b83b4b48be18a6f422b13e16f",
    "root_span_id": "5f37b85543afc15a",
    "start_time": "2024-12-04T08:08:30.501587",
    "end_time": "2024-12-04T08:08:36.026463"
  },
  ........
]
|
||||
|
||||
```
|
||||
|
||||
Querying spans for a specific root span id
|
||||
|
||||
``` bash
|
||||
curl -X POST 'http://localhost:5000/alpha/telemetry/get-span-tree' \
|
||||
-H 'Content-Type: application/json' \
|
||||
-d '{ "span_id" : "6cceb4b48a156913", "max_depth": 2 }'
|
||||
|
||||
{
|
||||
"span_id": "6cceb4b48a156913",
|
||||
"trace_id": "dafa796f6aaf925f511c04cd7c67fdda",
|
||||
"parent_span_id": "892a66d726c7f990",
|
||||
"name": "retrieve_rag_context",
|
||||
"start_time": "2024-12-04T09:28:21.781995",
|
||||
"end_time": "2024-12-04T09:28:21.913352",
|
||||
"attributes": {
|
||||
"input": [
|
||||
"{\"role\":\"system\",\"content\":\"You are a helpful assistant\"}",
|
||||
"{\"role\":\"user\",\"content\":\"What are the top 5 topics that were explained in the documentation? Only list succinct bullet points.\",\"context\":null}"
|
||||
]
|
||||
},
|
||||
"children": [
|
||||
{
|
||||
"span_id": "1a2df181854064a8",
|
||||
"trace_id": "dafa796f6aaf925f511c04cd7c67fdda",
|
||||
"parent_span_id": "6cceb4b48a156913",
|
||||
"name": "MemoryRouter.query_documents",
|
||||
"start_time": "2024-12-04T09:28:21.787620",
|
||||
"end_time": "2024-12-04T09:28:21.906512",
|
||||
"attributes": {
|
||||
"input": null
|
||||
},
|
||||
"children": [],
|
||||
"status": "ok"
|
||||
}
|
||||
],
|
||||
"status": "ok"
|
||||
}
|
||||
|
||||
```
|
||||
|
||||
## Example: Save Spans to Dataset
|
||||
Save all spans for a specific agent session to a dataset.
|
||||
``` bash
|
||||
curl -X POST 'http://localhost:5000/alpha/telemetry/save-spans-to-dataset' \
|
||||
-H 'Content-Type: application/json' \
|
||||
-d '{
|
||||
"attribute_filters": [
|
||||
{
|
||||
"key": "session_id",
|
||||
"op": "eq",
|
||||
"value": "dd667b87-ca4b-4d30-9265-5a0de318fc65"
|
||||
}
|
||||
],
|
||||
"attributes_to_save": ["input", "output"],
|
||||
"dataset_id": "my_dataset",
|
||||
"max_depth": 10
|
||||
}'
|
||||
```
|
||||
|
||||
Save all spans for a specific agent turn to a dataset.
|
||||
```bash
|
||||
curl -X POST 'http://localhost:5000/alpha/telemetry/save-spans-to-dataset' \
|
||||
-H 'Content-Type: application/json' \
|
||||
-d '{
|
||||
"attribute_filters": [
|
||||
{
|
||||
"key": "turn_id",
|
||||
"op": "eq",
|
||||
"value": "123e4567-e89b-12d3-a456-426614174000"
|
||||
}
|
||||
],
|
||||
"attributes_to_save": ["input", "output"],
|
||||
"dataset_id": "my_dataset",
|
||||
"max_depth": 10
|
||||
}'
|
||||
```
|
||||
|
|
@ -28,6 +28,7 @@ extensions = [
|
|||
"sphinx_tabs.tabs",
|
||||
"sphinx_design",
|
||||
"sphinxcontrib.redoc",
|
||||
"sphinxcontrib.mermaid",
|
||||
]
|
||||
myst_enable_extensions = ["colon_fence"]
|
||||
|
||||
|
|
@ -47,6 +48,7 @@ exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"]
|
|||
myst_enable_extensions = [
|
||||
"amsmath",
|
||||
"attrs_inline",
|
||||
"attrs_block",
|
||||
"colon_fence",
|
||||
"deflist",
|
||||
"dollarmath",
|
||||
|
|
@ -65,6 +67,7 @@ myst_substitutions = {
|
|||
"docker_hub": "https://hub.docker.com/repository/docker/llamastack",
|
||||
}
|
||||
|
||||
|
||||
# Copy button settings
|
||||
copybutton_prompt_text = "$ " # for bash prompts
|
||||
copybutton_prompt_is_regexp = True
|
||||
|
|
|
|||
|
|
@ -8,7 +8,7 @@ This guide contains references to walk you through adding a new API provider.
|
|||
- {repopath}`Remote Providers::llama_stack/providers/remote`
|
||||
- {repopath}`Inline Providers::llama_stack/providers/inline`
|
||||
|
||||
3. [Build a Llama Stack distribution](https://llama-stack.readthedocs.io/en/latest/distribution_dev/building_distro.html) with your API provider.
|
||||
3. [Build a Llama Stack distribution](https://llama-stack.readthedocs.io/en/latest/distributions/building_distro.html) with your API provider.
|
||||
4. Test your code!
|
||||
|
||||
## Testing your newly added API providers
|
||||
|
|
|
|||
|
|
@ -66,121 +66,247 @@ llama stack build --list-templates
|
|||
```
|
||||
|
||||
```
|
||||
+------------------------------+--------------------------------------------+----------------------------------------------------------------------------------+
|
||||
| Template Name | Providers | Description |
|
||||
+------------------------------+--------------------------------------------+----------------------------------------------------------------------------------+
|
||||
| hf-serverless | { | Like local, but use Hugging Face Inference API (serverless) for running LLM |
|
||||
| | "inference": "remote::hf::serverless", | inference. |
|
||||
| | "memory": "meta-reference", | See https://hf.co/docs/api-inference. |
|
||||
| | "safety": "meta-reference", | |
|
||||
| | "agents": "meta-reference", | |
|
||||
| | "telemetry": "meta-reference" | |
|
||||
| | } | |
|
||||
+------------------------------+--------------------------------------------+----------------------------------------------------------------------------------+
|
||||
| together | { | Use Together.ai for running LLM inference |
|
||||
| | "inference": "remote::together", | |
|
||||
| | "memory": [ | |
|
||||
| | "meta-reference", | |
|
||||
| | "remote::weaviate" | |
|
||||
| | ], | |
|
||||
| | "safety": "meta-reference", | |
|
||||
| | "agents": "meta-reference", | |
|
||||
| | "telemetry": "meta-reference" | |
|
||||
| | } | |
|
||||
+------------------------------+--------------------------------------------+----------------------------------------------------------------------------------+
|
||||
| fireworks | { | Use Fireworks.ai for running LLM inference |
|
||||
| | "inference": "remote::fireworks", | |
|
||||
| | "memory": [ | |
|
||||
| | "meta-reference", | |
|
||||
| | "remote::weaviate", | |
|
||||
| | "remote::chromadb", | |
|
||||
| | "remote::pgvector" | |
|
||||
| | ], | |
|
||||
| | "safety": "meta-reference", | |
|
||||
| | "agents": "meta-reference", | |
|
||||
| | "telemetry": "meta-reference" | |
|
||||
| | } | |
|
||||
+------------------------------+--------------------------------------------+----------------------------------------------------------------------------------+
|
||||
| databricks | { | Use Databricks for running LLM inference |
|
||||
| | "inference": "remote::databricks", | |
|
||||
| | "memory": "meta-reference", | |
|
||||
| | "safety": "meta-reference", | |
|
||||
| | "agents": "meta-reference", | |
|
||||
| | "telemetry": "meta-reference" | |
|
||||
| | } | |
|
||||
+------------------------------+--------------------------------------------+----------------------------------------------------------------------------------+
|
||||
| vllm | { | Like local, but use vLLM for running LLM inference |
|
||||
| | "inference": "vllm", | |
|
||||
| | "memory": "meta-reference", | |
|
||||
| | "safety": "meta-reference", | |
|
||||
| | "agents": "meta-reference", | |
|
||||
| | "telemetry": "meta-reference" | |
|
||||
| | } | |
|
||||
+------------------------------+--------------------------------------------+----------------------------------------------------------------------------------+
|
||||
| tgi | { | Use TGI for running LLM inference |
|
||||
| | "inference": "remote::tgi", | |
|
||||
| | "memory": [ | |
|
||||
| | "meta-reference", | |
|
||||
| | "remote::chromadb", | |
|
||||
| | "remote::pgvector" | |
|
||||
| | ], | |
|
||||
| | "safety": "meta-reference", | |
|
||||
| | "agents": "meta-reference", | |
|
||||
| | "telemetry": "meta-reference" | |
|
||||
| | } | |
|
||||
+------------------------------+--------------------------------------------+----------------------------------------------------------------------------------+
|
||||
| bedrock | { | Use Amazon Bedrock APIs. |
|
||||
| | "inference": "remote::bedrock", | |
|
||||
| | "memory": "meta-reference", | |
|
||||
| | "safety": "meta-reference", | |
|
||||
| | "agents": "meta-reference", | |
|
||||
| | "telemetry": "meta-reference" | |
|
||||
| | } | |
|
||||
+------------------------------+--------------------------------------------+----------------------------------------------------------------------------------+
|
||||
| meta-reference-gpu | { | Use code from `llama_stack` itself to serve all llama stack APIs |
|
||||
| | "inference": "meta-reference", | |
|
||||
| | "memory": [ | |
|
||||
| | "meta-reference", | |
|
||||
| | "remote::chromadb", | |
|
||||
| | "remote::pgvector" | |
|
||||
| | ], | |
|
||||
| | "safety": "meta-reference", | |
|
||||
| | "agents": "meta-reference", | |
|
||||
| | "telemetry": "meta-reference" | |
|
||||
| | } | |
|
||||
+------------------------------+--------------------------------------------+----------------------------------------------------------------------------------+
|
||||
| meta-reference-quantized-gpu | { | Use code from `llama_stack` itself to serve all llama stack APIs |
|
||||
| | "inference": "meta-reference-quantized", | |
|
||||
| | "memory": [ | |
|
||||
| | "meta-reference", | |
|
||||
| | "remote::chromadb", | |
|
||||
| | "remote::pgvector" | |
|
||||
| | ], | |
|
||||
| | "safety": "meta-reference", | |
|
||||
| | "agents": "meta-reference", | |
|
||||
| | "telemetry": "meta-reference" | |
|
||||
| | } | |
|
||||
+------------------------------+--------------------------------------------+----------------------------------------------------------------------------------+
|
||||
| ollama | { | Use ollama for running LLM inference |
|
||||
| | "inference": "remote::ollama", | |
|
||||
| | "memory": [ | |
|
||||
| | "meta-reference", | |
|
||||
| | "remote::chromadb", | |
|
||||
| | "remote::pgvector" | |
|
||||
| | ], | |
|
||||
| | "safety": "meta-reference", | |
|
||||
| | "agents": "meta-reference", | |
|
||||
| | "telemetry": "meta-reference" | |
|
||||
| | } | |
|
||||
+------------------------------+--------------------------------------------+----------------------------------------------------------------------------------+
|
||||
| hf-endpoint | { | Like local, but use Hugging Face Inference Endpoints for running LLM inference. |
|
||||
| | "inference": "remote::hf::endpoint", | See https://hf.co/docs/api-endpoints. |
|
||||
| | "memory": "meta-reference", | |
|
||||
| | "safety": "meta-reference", | |
|
||||
| | "agents": "meta-reference", | |
|
||||
| | "telemetry": "meta-reference" | |
|
||||
| | } | |
|
||||
+------------------------------+--------------------------------------------+----------------------------------------------------------------------------------+
|
||||
+------------------------------+----------------------------------------+-----------------------------------------------------------------------------+
|
||||
| Template Name | Providers | Description |
|
||||
+------------------------------+----------------------------------------+-----------------------------------------------------------------------------+
|
||||
| tgi | { | Use (an external) TGI server for running LLM inference |
|
||||
| | "inference": [ | |
|
||||
| | "remote::tgi" | |
|
||||
| | ], | |
|
||||
| | "memory": [ | |
|
||||
| | "inline::faiss", | |
|
||||
| | "remote::chromadb", | |
|
||||
| | "remote::pgvector" | |
|
||||
| | ], | |
|
||||
| | "safety": [ | |
|
||||
| | "inline::llama-guard" | |
|
||||
| | ], | |
|
||||
| | "agents": [ | |
|
||||
| | "inline::meta-reference" | |
|
||||
| | ], | |
|
||||
| | "telemetry": [ | |
|
||||
| | "inline::meta-reference" | |
|
||||
| | ] | |
|
||||
| | } | |
|
||||
+------------------------------+----------------------------------------+-----------------------------------------------------------------------------+
|
||||
| remote-vllm | { | Use (an external) vLLM server for running LLM inference |
|
||||
| | "inference": [ | |
|
||||
| | "remote::vllm" | |
|
||||
| | ], | |
|
||||
| | "memory": [ | |
|
||||
| | "inline::faiss", | |
|
||||
| | "remote::chromadb", | |
|
||||
| | "remote::pgvector" | |
|
||||
| | ], | |
|
||||
| | "safety": [ | |
|
||||
| | "inline::llama-guard" | |
|
||||
| | ], | |
|
||||
| | "agents": [ | |
|
||||
| | "inline::meta-reference" | |
|
||||
| | ], | |
|
||||
| | "telemetry": [ | |
|
||||
| | "inline::meta-reference" | |
|
||||
| | ] | |
|
||||
| | } | |
|
||||
+------------------------------+----------------------------------------+-----------------------------------------------------------------------------+
|
||||
| vllm-gpu | { | Use a built-in vLLM engine for running LLM inference |
|
||||
| | "inference": [ | |
|
||||
| | "inline::vllm" | |
|
||||
| | ], | |
|
||||
| | "memory": [ | |
|
||||
| | "inline::faiss", | |
|
||||
| | "remote::chromadb", | |
|
||||
| | "remote::pgvector" | |
|
||||
| | ], | |
|
||||
| | "safety": [ | |
|
||||
| | "inline::llama-guard" | |
|
||||
| | ], | |
|
||||
| | "agents": [ | |
|
||||
| | "inline::meta-reference" | |
|
||||
| | ], | |
|
||||
| | "telemetry": [ | |
|
||||
| | "inline::meta-reference" | |
|
||||
| | ] | |
|
||||
| | } | |
|
||||
+------------------------------+----------------------------------------+-----------------------------------------------------------------------------+
|
||||
| meta-reference-quantized-gpu | { | Use Meta Reference with fp8, int4 quantization for running LLM inference |
|
||||
| | "inference": [ | |
|
||||
| | "inline::meta-reference-quantized" | |
|
||||
| | ], | |
|
||||
| | "memory": [ | |
|
||||
| | "inline::faiss", | |
|
||||
| | "remote::chromadb", | |
|
||||
| | "remote::pgvector" | |
|
||||
| | ], | |
|
||||
| | "safety": [ | |
|
||||
| | "inline::llama-guard" | |
|
||||
| | ], | |
|
||||
| | "agents": [ | |
|
||||
| | "inline::meta-reference" | |
|
||||
| | ], | |
|
||||
| | "telemetry": [ | |
|
||||
| | "inline::meta-reference" | |
|
||||
| | ] | |
|
||||
| | } | |
|
||||
+------------------------------+----------------------------------------+-----------------------------------------------------------------------------+
|
||||
| meta-reference-gpu | { | Use Meta Reference for running LLM inference |
|
||||
| | "inference": [ | |
|
||||
| | "inline::meta-reference" | |
|
||||
| | ], | |
|
||||
| | "memory": [ | |
|
||||
| | "inline::faiss", | |
|
||||
| | "remote::chromadb", | |
|
||||
| | "remote::pgvector" | |
|
||||
| | ], | |
|
||||
| | "safety": [ | |
|
||||
| | "inline::llama-guard" | |
|
||||
| | ], | |
|
||||
| | "agents": [ | |
|
||||
| | "inline::meta-reference" | |
|
||||
| | ], | |
|
||||
| | "telemetry": [ | |
|
||||
| | "inline::meta-reference" | |
|
||||
| | ] | |
|
||||
| | } | |
|
||||
+------------------------------+----------------------------------------+-----------------------------------------------------------------------------+
|
||||
| hf-serverless | { | Use (an external) Hugging Face Inference Endpoint for running LLM inference |
|
||||
| | "inference": [ | |
|
||||
| | "remote::hf::serverless" | |
|
||||
| | ], | |
|
||||
| | "memory": [ | |
|
||||
| | "inline::faiss", | |
|
||||
| | "remote::chromadb", | |
|
||||
| | "remote::pgvector" | |
|
||||
| | ], | |
|
||||
| | "safety": [ | |
|
||||
| | "inline::llama-guard" | |
|
||||
| | ], | |
|
||||
| | "agents": [ | |
|
||||
| | "inline::meta-reference" | |
|
||||
| | ], | |
|
||||
| | "telemetry": [ | |
|
||||
| | "inline::meta-reference" | |
|
||||
| | ] | |
|
||||
| | } | |
|
||||
+------------------------------+----------------------------------------+-----------------------------------------------------------------------------+
|
||||
| together | { | Use Together.AI for running LLM inference |
|
||||
| | "inference": [ | |
|
||||
| | "remote::together" | |
|
||||
| | ], | |
|
||||
| | "memory": [ | |
|
||||
| | "inline::faiss", | |
|
||||
| | "remote::chromadb", | |
|
||||
| | "remote::pgvector" | |
|
||||
| | ], | |
|
||||
| | "safety": [ | |
|
||||
| | "inline::llama-guard" | |
|
||||
| | ], | |
|
||||
| | "agents": [ | |
|
||||
| | "inline::meta-reference" | |
|
||||
| | ], | |
|
||||
| | "telemetry": [ | |
|
||||
| | "inline::meta-reference" | |
|
||||
| | ] | |
|
||||
| | } | |
|
||||
+------------------------------+----------------------------------------+-----------------------------------------------------------------------------+
|
||||
| ollama | { | Use (an external) Ollama server for running LLM inference |
|
||||
| | "inference": [ | |
|
||||
| | "remote::ollama" | |
|
||||
| | ], | |
|
||||
| | "memory": [ | |
|
||||
| | "inline::faiss", | |
|
||||
| | "remote::chromadb", | |
|
||||
| | "remote::pgvector" | |
|
||||
| | ], | |
|
||||
| | "safety": [ | |
|
||||
| | "inline::llama-guard" | |
|
||||
| | ], | |
|
||||
| | "agents": [ | |
|
||||
| | "inline::meta-reference" | |
|
||||
| | ], | |
|
||||
| | "telemetry": [ | |
|
||||
| | "inline::meta-reference" | |
|
||||
| | ] | |
|
||||
| | } | |
|
||||
+------------------------------+----------------------------------------+-----------------------------------------------------------------------------+
|
||||
| bedrock | { | Use AWS Bedrock for running LLM inference and safety |
|
||||
| | "inference": [ | |
|
||||
| | "remote::bedrock" | |
|
||||
| | ], | |
|
||||
| | "memory": [ | |
|
||||
| | "inline::faiss", | |
|
||||
| | "remote::chromadb", | |
|
||||
| | "remote::pgvector" | |
|
||||
| | ], | |
|
||||
| | "safety": [ | |
|
||||
| | "remote::bedrock" | |
|
||||
| | ], | |
|
||||
| | "agents": [ | |
|
||||
| | "inline::meta-reference" | |
|
||||
| | ], | |
|
||||
| | "telemetry": [ | |
|
||||
| | "inline::meta-reference" | |
|
||||
| | ] | |
|
||||
| | } | |
|
||||
+------------------------------+----------------------------------------+-----------------------------------------------------------------------------+
|
||||
| hf-endpoint | { | Use (an external) Hugging Face Inference Endpoint for running LLM inference |
|
||||
| | "inference": [ | |
|
||||
| | "remote::hf::endpoint" | |
|
||||
| | ], | |
|
||||
| | "memory": [ | |
|
||||
| | "inline::faiss", | |
|
||||
| | "remote::chromadb", | |
|
||||
| | "remote::pgvector" | |
|
||||
| | ], | |
|
||||
| | "safety": [ | |
|
||||
| | "inline::llama-guard" | |
|
||||
| | ], | |
|
||||
| | "agents": [ | |
|
||||
| | "inline::meta-reference" | |
|
||||
| | ], | |
|
||||
| | "telemetry": [ | |
|
||||
| | "inline::meta-reference" | |
|
||||
| | ] | |
|
||||
| | } | |
|
||||
+------------------------------+----------------------------------------+-----------------------------------------------------------------------------+
|
||||
| fireworks | { | Use Fireworks.AI for running LLM inference |
|
||||
| | "inference": [ | |
|
||||
| | "remote::fireworks" | |
|
||||
| | ], | |
|
||||
| | "memory": [ | |
|
||||
| | "inline::faiss", | |
|
||||
| | "remote::chromadb", | |
|
||||
| | "remote::pgvector" | |
|
||||
| | ], | |
|
||||
| | "safety": [ | |
|
||||
| | "inline::llama-guard" | |
|
||||
| | ], | |
|
||||
| | "agents": [ | |
|
||||
| | "inline::meta-reference" | |
|
||||
| | ], | |
|
||||
| | "telemetry": [ | |
|
||||
| | "inline::meta-reference" | |
|
||||
| | ] | |
|
||||
| | } | |
|
||||
+------------------------------+----------------------------------------+-----------------------------------------------------------------------------+
|
||||
| cerebras | { | Use Cerebras for running LLM inference |
|
||||
| | "inference": [ | |
|
||||
| | "remote::cerebras" | |
|
||||
| | ], | |
|
||||
| | "safety": [ | |
|
||||
| | "inline::llama-guard" | |
|
||||
| | ], | |
|
||||
| | "memory": [ | |
|
||||
| | "inline::meta-reference" | |
|
||||
| | ], | |
|
||||
| | "agents": [ | |
|
||||
| | "inline::meta-reference" | |
|
||||
| | ], | |
|
||||
| | "telemetry": [ | |
|
||||
| | "inline::meta-reference" | |
|
||||
| | ] | |
|
||||
| | } | |
|
||||
+------------------------------+----------------------------------------+-----------------------------------------------------------------------------+
|
||||
```
|
||||
|
||||
You may then pick a template to build your distribution with providers fitted to your liking.
|
||||
|
|
|
|||
|
|
@ -81,6 +81,8 @@ A few things to note:
|
|||
- The configuration dictionary is provider-specific. Notice that configuration can reference environment variables (with default values), which are expanded at runtime. When you run a stack server (via docker or via `llama stack run`), you can specify `--env OLLAMA_URL=http://my-server:11434` to override the default value.
|
||||
|
||||
## Resources
|
||||
```
|
||||
|
||||
Finally, let's look at the `models` section:
|
||||
```yaml
|
||||
models:
|
||||
|
|
|
|||
|
|
@ -21,7 +21,7 @@ print(response)
|
|||
```python
|
||||
response = await client.inference.chat_completion(
|
||||
messages=[UserMessage(content="What is the capital of France?", role="user")],
|
||||
model="Llama3.1-8B-Instruct",
|
||||
model_id="Llama3.1-8B-Instruct",
|
||||
stream=False,
|
||||
)
|
||||
print("\nChat completion response:")
|
||||
|
|
|
|||
|
|
@ -35,6 +35,6 @@ If so, we suggest:
|
|||
|
||||
- **Do you want to run Llama Stack inference on your iOS / Android device** If so, we suggest:
|
||||
- [iOS SDK](ondevice_distro/ios_sdk)
|
||||
- Android (coming soon)
|
||||
- [Android](ondevice_distro/android_sdk)
|
||||
|
||||
You can also build your own [custom distribution](building_distro).
|
||||
|
|
|
|||
247
docs/source/distributions/ondevice_distro/android_sdk.md
Normal file
|
|
@ -0,0 +1,247 @@
|
|||
# Llama Stack Client Kotlin API Library
|
||||
|
||||
We are excited to share a guide for a Kotlin library that brings the benefits of Llama Stack to your Android device. This library is a set of SDKs that provides a simple and effective way to integrate AI capabilities into your Android app, whether inference runs locally (on-device) or remotely.
|
||||
|
||||
Features:
|
||||
- Local Inferencing: Run Llama models purely on-device with real-time processing. We currently utilize ExecuTorch as the local inference distributor and may support others in the future.
|
||||
- [ExecuTorch](https://github.com/pytorch/executorch/tree/main) is a complete end-to-end solution within the PyTorch framework for inferencing capabilities on-device with high portability and seamless performance.
|
||||
- Remote Inferencing: Perform inference tasks remotely with Llama models hosted on a remote server (or a serverless localhost endpoint).
|
||||
- Simple Integration: With easy-to-use APIs, a developer can quickly integrate Llama Stack in their Android app. The difference with local vs remote inferencing is also minimal.
|
||||
|
||||
Latest Release Notes: [v0.0.54.1](https://github.com/meta-llama/llama-stack-client-kotlin/releases/tag/v0.0.54.1)
|
||||
|
||||
## Android Demo App
|
||||
Check out our demo app to see how to integrate Llama Stack into your Android app: [Android Demo App](https://github.com/meta-llama/llama-stack-apps/tree/main/examples/android_app)
|
||||
|
||||
The key files in the app are `LlamaStackLocalInference.kt`, `LlamaStackRemoteInference.kts`, and `MainActivity.java`. Together with the surrounding business logic, the app shows how to use Llama Stack in both environments.
|
||||
|
||||
## Quick Start
|
||||
|
||||
### Add Dependencies
|
||||
#### Kotlin Library
|
||||
Add the following dependency in your `build.gradle.kts` file:
|
||||
```
|
||||
dependencies {
|
||||
implementation("com.llama.llamastack:llama-stack-client-kotlin:0.0.54.1")
|
||||
}
|
||||
```
|
||||
This will download jar files in your gradle cache in a directory like `~/.gradle/caches/modules-2/files-2.1/com.llama.llamastack/`
|
||||
|
||||
If you plan on doing remote inferencing this is sufficient to get started.
|
||||
|
||||
#### Dependency for Local
|
||||
|
||||
For local inferencing, you must include the ExecuTorch library in your app.
|
||||
|
||||
Include the ExecuTorch library by:
|
||||
1. Download the `download-prebuilt-et-lib.sh` script file from the [llama-stack-client-kotlin-client-local](https://github.com/meta-llama/llama-stack-client-kotlin/blob/release/0.0.54.1/llama-stack-client-kotlin-client-local/download-prebuilt-et-lib.sh) directory to your local machine.
|
||||
2. Move the script to the top level of your Android app where the app directory resides:
|
||||
<p align="center">
|
||||
<img src="https://raw.githubusercontent.com/meta-llama/llama-stack-client-kotlin/refs/heads/release/0.0.54.1/doc/img/example_android_app_directory.png" style="width:300px">
|
||||
</p>
|
||||
|
||||
3. Run `sh download-prebuilt-et-lib.sh` to create an `app/libs` directory and download the `executorch.aar` in that path. This generates an ExecuTorch library for the XNNPACK delegate with commit: [0a12e33](https://github.com/pytorch/executorch/commit/0a12e33d22a3d44d1aa2af5f0d0673d45b962553).
|
||||
4. Add the `executorch.aar` dependency in your `build.gradle.kts` file:
|
||||
```
|
||||
dependencies {
|
||||
...
|
||||
implementation(files("libs/executorch.aar"))
|
||||
...
|
||||
}
|
||||
```
|
||||
|
||||
## Llama Stack APIs in Your Android App
|
||||
Breaking down the demo app, this section will show the core pieces that are used to initialize and run inference with Llama Stack using the Kotlin library.
|
||||
|
||||
### Setup Remote Inferencing
|
||||
Start a Llama Stack server on localhost. Here is an example of how you can do this using the fireworks.ai distribution:
|
||||
```
|
||||
conda create -n stack-fireworks python=3.10
|
||||
conda activate stack-fireworks
|
||||
pip install llama-stack==0.0.54
|
||||
llama stack build --template fireworks --image-type conda
|
||||
export FIREWORKS_API_KEY=<SOME_KEY>
|
||||
llama stack run /Users/<your_username>/.llama/distributions/llamastack-fireworks/fireworks-run.yaml --port=5050
|
||||
```
|
||||
|
||||
Other inference providers: [Table](https://llama-stack.readthedocs.io/en/latest/index.html#supported-llama-stack-implementations)
|
||||
|
||||
How to set remote localhost in Demo App: [Settings](https://github.com/meta-llama/llama-stack-apps/tree/main/examples/android_app#settings)
|
||||
|
||||
### Initialize the Client
|
||||
A client serves as the primary interface for interacting with a specific inference type and its associated parameters. Only after the client is initialized can you configure and run inferences.
|
||||
|
||||
<table>
|
||||
<tr>
|
||||
<th>Local Inference</th>
|
||||
<th>Remote Inference</th>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>
|
||||
|
||||
```
|
||||
client = LlamaStackClientLocalClient
|
||||
.builder()
|
||||
.modelPath(modelPath)
|
||||
.tokenizerPath(tokenizerPath)
|
||||
.temperature(temperature)
|
||||
.build()
|
||||
```
|
||||
</td>
|
||||
<td>
|
||||
|
||||
```
|
||||
// remoteURL is a string like "http://localhost:5050"
|
||||
client = LlamaStackClientOkHttpClient
|
||||
.builder()
|
||||
.baseUrl(remoteURL)
|
||||
.build()
|
||||
```
|
||||
</td>
|
||||
</tr>
|
||||
</table>
|
||||
|
||||
|
||||
### Run Inference
|
||||
With the Kotlin library managing all the major operational logic, running simple chat inference requires minimal to no changes between local and remote:
|
||||
|
||||
```
|
||||
val result = client!!.inference().chatCompletion(
|
||||
InferenceChatCompletionParams.builder()
|
||||
.modelId(modelName)
|
||||
.putAdditionalQueryParam("seq_len", sequenceLength.toString())
|
||||
.messages(listOfMessages)
|
||||
.build()
|
||||
)
|
||||
|
||||
// response contains string with response from model
|
||||
var response = result.asChatCompletionResponse().completionMessage().content().string();
|
||||
```
|
||||
|
||||
### Setup Tool Calling
|
||||
|
||||
Android demo app for more details: [Tool Calling](https://github.com/meta-llama/llama-stack-apps/tree/main/examples/android_app#tool-calling)
|
||||
|
||||
## Advanced Users
|
||||
|
||||
The purpose of this section is to share more details with users that would like to dive deeper into the Llama Stack Kotlin Library. Whether you’re interested in contributing to the open source library, debugging or just want to learn more, this section is for you!
|
||||
|
||||
### Prerequisite
|
||||
|
||||
You must complete the following steps:
|
||||
1. Clone the repo (`git clone https://github.com/meta-llama/llama-stack-client-kotlin.git -b release/0.0.54.1`)
|
||||
2. Port the appropriate ExecuTorch libraries over into your Llama Stack Kotlin library environment.
|
||||
```
|
||||
cd llama-stack-client-kotlin-client-local
|
||||
sh download-prebuilt-et-lib.sh --unzip
|
||||
```
|
||||
|
||||
Now you will notice that the `jni/`, `libs/`, and `AndroidManifest.xml` files from the `executorch.aar` file are present in the local module. This allows the local client module to make use of the ExecuTorch SDK.
|
||||
|
||||
### Building for Development/Debugging
|
||||
If you’d like to contribute to the Kotlin library, debug it, or simply play around with it using various print statements, run the following command in your terminal from the llama-stack-client-kotlin directory.
|
||||
|
||||
```
|
||||
sh build-libs.sh
|
||||
```
|
||||
|
||||
Output: .jar files located in the build-jars directory
|
||||
|
||||
Copy the `.jar` files over to the `libs` directory in your Android app. At the same time, make sure to remove the `llama-stack-client-kotlin` dependency from the `build.gradle.kts` file in your app (or in the demo app, if you are using it) to avoid having multiple Llama Stack client dependencies.
|
||||
|
||||
### Additional Options for Local Inferencing
|
||||
Currently, we provide additional-properties support for local inferencing. To get the tokens/sec metric for each inference call, add the following code to your Android app after running your chatCompletion inference function. The reference app includes this implementation as well:
|
||||
```
|
||||
var tps = (result.asChatCompletionResponse()._additionalProperties()["tps"] as JsonNumber).value as Float
|
||||
```
|
||||
We will be adding more properties in the future.
|
||||
|
||||
### Additional Options for Remote Inferencing
|
||||
|
||||
#### Network options
|
||||
|
||||
##### Retries
|
||||
|
||||
Requests that experience certain errors are automatically retried 2 times by default, with a short exponential backoff. Connection errors (for example, due to a network connectivity problem), 408 Request Timeout, 409 Conflict, 429 Rate Limit, and >=500 Internal errors will all be retried by default.
|
||||
You can provide a `maxRetries` on the client builder to configure this:
|
||||
|
||||
```kotlin
|
||||
val client = LlamaStackClientOkHttpClient.builder()
|
||||
.fromEnv()
|
||||
.maxRetries(4)
|
||||
.build()
|
||||
```
|
||||
|
||||
##### Timeouts
|
||||
|
||||
Requests time out after 1 minute by default. You can configure this on the client builder:
|
||||
|
||||
```kotlin
|
||||
val client = LlamaStackClientOkHttpClient.builder()
|
||||
.fromEnv()
|
||||
.timeout(Duration.ofSeconds(30))
|
||||
.build()
|
||||
```
|
||||
|
||||
##### Proxies
|
||||
|
||||
Requests can be routed through a proxy. You can configure this on the client builder:
|
||||
|
||||
```kotlin
|
||||
val client = LlamaStackClientOkHttpClient.builder()
|
||||
.fromEnv()
|
||||
.proxy(new Proxy(
|
||||
Type.HTTP,
|
||||
new InetSocketAddress("proxy.com", 8080)
|
||||
))
|
||||
.build()
|
||||
```
|
||||
|
||||
##### Environments
|
||||
|
||||
Requests are made to the production environment by default. You can connect to other environments, like `sandbox`, via the client builder:
|
||||
|
||||
```kotlin
|
||||
val client = LlamaStackClientOkHttpClient.builder()
|
||||
.fromEnv()
|
||||
.sandbox()
|
||||
.build()
|
||||
```
|
||||
|
||||
### Error Handling
|
||||
This library throws exceptions in a single hierarchy for easy handling:
|
||||
|
||||
- **`LlamaStackClientException`** - Base exception for all exceptions
|
||||
|
||||
- **`LlamaStackClientServiceException`** - HTTP errors with a well-formed response body we were able to parse. The exception message and the `.debuggingRequestId()` will be set by the server.
|
||||
|
||||
| 400 | BadRequestException |
|
||||
| ------ | ----------------------------- |
|
||||
| 401 | AuthenticationException |
|
||||
| 403 | PermissionDeniedException |
|
||||
| 404 | NotFoundException |
|
||||
| 422 | UnprocessableEntityException |
|
||||
| 429 | RateLimitException |
|
||||
| 5xx | InternalServerException |
|
||||
| others | UnexpectedStatusCodeException |
|
||||
|
||||
- **`LlamaStackClientIoException`** - I/O networking errors
|
||||
- **`LlamaStackClientInvalidDataException`** - any other exceptions on the client side, e.g.:
|
||||
- We failed to serialize the request body
|
||||
- We failed to parse the response body (has access to response code and body)
|
||||
|
||||
## Reporting Issues
|
||||
If you encounter any bugs or issues while following this guide, please file an issue on our [GitHub issue tracker](https://github.com/meta-llama/llama-stack-client-kotlin/issues).
|
||||
|
||||
## Known Issues
|
||||
We're aware of the following issues and are working to resolve them:
|
||||
1. Streaming response is a work-in-progress for local and remote inference
|
||||
2. Due to #1, agents are not supported at this time; Llama Stack agents only work in streaming mode
|
||||
3. Changing to another model is a work in progress for local and remote platforms
|
||||
|
||||
## Thanks
|
||||
We’d like to extend our thanks to the ExecuTorch team for providing their support as we integrated ExecuTorch as one of the local inference distributors for Llama Stack. Check out the [ExecuTorch GitHub repo](https://github.com/pytorch/executorch/tree/main) for more information.
|
||||
|
||||
---
|
||||
|
||||
The API interface is generated using the OpenAPI standard with [Stainless](https://www.stainlessapi.com/).
|
||||
|
|
@ -1,6 +1,3 @@
|
|||
---
|
||||
orphan: true
|
||||
---
|
||||
# Bedrock Distribution
|
||||
|
||||
```{toctree}
|
||||
|
|
@ -15,9 +12,12 @@ The `llamastack/distribution-bedrock` distribution consists of the following pro
|
|||
| API | Provider(s) |
|
||||
|-----|-------------|
|
||||
| agents | `inline::meta-reference` |
|
||||
| datasetio | `remote::huggingface`, `inline::localfs` |
|
||||
| eval | `inline::meta-reference` |
|
||||
| inference | `remote::bedrock` |
|
||||
| memory | `inline::faiss`, `remote::chromadb`, `remote::pgvector` |
|
||||
| safety | `remote::bedrock` |
|
||||
| scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` |
|
||||
| telemetry | `inline::meta-reference` |
|
||||
|
||||
|
||||
|
|
|
|||
61
docs/source/distributions/self_hosted_distro/cerebras.md
Normal file
|
|
@ -0,0 +1,61 @@
|
|||
# Cerebras Distribution
|
||||
|
||||
The `llamastack/distribution-cerebras` distribution consists of the following provider configurations.
|
||||
|
||||
| API | Provider(s) |
|
||||
|-----|-------------|
|
||||
| agents | `inline::meta-reference` |
|
||||
| inference | `remote::cerebras` |
|
||||
| memory | `inline::meta-reference` |
|
||||
| safety | `inline::llama-guard` |
|
||||
| telemetry | `inline::meta-reference` |
|
||||
|
||||
|
||||
### Environment Variables
|
||||
|
||||
The following environment variables can be configured:
|
||||
|
||||
- `LLAMASTACK_PORT`: Port for the Llama Stack distribution server (default: `5001`)
|
||||
- `CEREBRAS_API_KEY`: Cerebras API Key (default: ``)
|
||||
|
||||
### Models
|
||||
|
||||
The following models are available by default:
|
||||
|
||||
- `meta-llama/Llama-3.1-8B-Instruct (llama3.1-8b)`
|
||||
- `meta-llama/Llama-3.1-70B-Instruct (llama3.1-70b)`
|
||||
|
||||
|
||||
### Prerequisite: API Keys
|
||||
|
||||
Make sure you have access to a Cerebras API Key. You can get one by visiting [cloud.cerebras.ai](https://cloud.cerebras.ai/).
|
||||
|
||||
|
||||
## Running Llama Stack with Cerebras
|
||||
|
||||
You can do this via Conda (building the code) or Docker, which has a pre-built image.
|
||||
|
||||
### Via Docker
|
||||
|
||||
This method allows you to get started quickly without having to build the distribution code.
|
||||
|
||||
```bash
|
||||
LLAMA_STACK_PORT=5001
|
||||
docker run \
|
||||
-it \
|
||||
-p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
|
||||
-v ./run.yaml:/root/my-run.yaml \
|
||||
llamastack/distribution-cerebras \
|
||||
--yaml-config /root/my-run.yaml \
|
||||
--port $LLAMA_STACK_PORT \
|
||||
--env CEREBRAS_API_KEY=$CEREBRAS_API_KEY
|
||||
```
|
||||
|
||||
### Via Conda
|
||||
|
||||
```bash
|
||||
llama stack build --template cerebras --image-type conda
|
||||
llama stack run ./run.yaml \
|
||||
--port 5001 \
|
||||
--env CEREBRAS_API_KEY=$CEREBRAS_API_KEY
|
||||
```
|
||||
|
|
@ -15,9 +15,12 @@ The `llamastack/distribution-fireworks` distribution consists of the following p
|
|||
| API | Provider(s) |
|
||||
|-----|-------------|
|
||||
| agents | `inline::meta-reference` |
|
||||
| datasetio | `remote::huggingface`, `inline::localfs` |
|
||||
| eval | `inline::meta-reference` |
|
||||
| inference | `remote::fireworks` |
|
||||
| memory | `inline::faiss`, `remote::chromadb`, `remote::pgvector` |
|
||||
| safety | `inline::llama-guard` |
|
||||
| scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` |
|
||||
| telemetry | `inline::meta-reference` |
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -15,9 +15,12 @@ The `llamastack/distribution-meta-reference-gpu` distribution consists of the fo
|
|||
| API | Provider(s) |
|
||||
|-----|-------------|
|
||||
| agents | `inline::meta-reference` |
|
||||
| datasetio | `remote::huggingface`, `inline::localfs` |
|
||||
| eval | `inline::meta-reference` |
|
||||
| inference | `inline::meta-reference` |
|
||||
| memory | `inline::faiss`, `remote::chromadb`, `remote::pgvector` |
|
||||
| safety | `inline::llama-guard` |
|
||||
| scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` |
|
||||
| telemetry | `inline::meta-reference` |
|
||||
|
||||
|
||||
|
|
@ -36,7 +39,7 @@ The following environment variables can be configured:
|
|||
|
||||
## Prerequisite: Downloading Models
|
||||
|
||||
Please make sure you have llama model checkpoints downloaded in `~/.llama` before proceeding. See [installation guide](https://llama-stack.readthedocs.io/en/latest/cli_reference/download_models.html) here to download the models. Run `llama model list` to see the available models to download, and `llama model download` to download the checkpoints.
|
||||
Please make sure you have llama model checkpoints downloaded in `~/.llama` before proceeding. See [installation guide](https://llama-stack.readthedocs.io/en/latest/references/llama_cli_reference/download_models.html) here to download the models. Run `llama model list` to see the available models to download, and `llama model download` to download the checkpoints.
|
||||
|
||||
```
|
||||
$ ls ~/.llama/checkpoints
|
||||
|
|
@ -57,6 +60,7 @@ LLAMA_STACK_PORT=5001
|
|||
docker run \
|
||||
-it \
|
||||
-p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
|
||||
-v ~/.llama:/root/.llama \
|
||||
llamastack/distribution-meta-reference-gpu \
|
||||
--port $LLAMA_STACK_PORT \
|
||||
--env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct
|
||||
|
|
@ -68,6 +72,7 @@ If you are using Llama Stack Safety / Shield APIs, use:
|
|||
docker run \
|
||||
-it \
|
||||
-p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
|
||||
-v ~/.llama:/root/.llama \
|
||||
llamastack/distribution-meta-reference-gpu \
|
||||
--port $LLAMA_STACK_PORT \
|
||||
--env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct \
|
||||
|
|
|
|||
|
|
@ -15,9 +15,12 @@ The `llamastack/distribution-meta-reference-quantized-gpu` distribution consists
|
|||
| API | Provider(s) |
|
||||
|-----|-------------|
|
||||
| agents | `inline::meta-reference` |
|
||||
| datasetio | `remote::huggingface`, `inline::localfs` |
|
||||
| eval | `inline::meta-reference` |
|
||||
| inference | `inline::meta-reference-quantized` |
|
||||
| memory | `inline::faiss`, `remote::chromadb`, `remote::pgvector` |
|
||||
| safety | `inline::llama-guard` |
|
||||
| scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` |
|
||||
| telemetry | `inline::meta-reference` |
|
||||
|
||||
|
||||
|
|
@ -36,7 +39,7 @@ The following environment variables can be configured:
|
|||
|
||||
## Prerequisite: Downloading Models
|
||||
|
||||
Please make sure you have llama model checkpoints downloaded in `~/.llama` before proceeding. See [installation guide](https://llama-stack.readthedocs.io/en/latest/cli_reference/download_models.html) here to download the models. Run `llama model list` to see the available models to download, and `llama model download` to download the checkpoints.
|
||||
Please make sure you have llama model checkpoints downloaded in `~/.llama` before proceeding. See [installation guide](https://llama-stack.readthedocs.io/en/latest/references/llama_cli_reference/download_models.html) here to download the models. Run `llama model list` to see the available models to download, and `llama model download` to download the checkpoints.
|
||||
|
||||
```
|
||||
$ ls ~/.llama/checkpoints
|
||||
|
|
@ -57,6 +60,7 @@ LLAMA_STACK_PORT=5001
|
|||
docker run \
|
||||
-it \
|
||||
-p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
|
||||
-v ~/.llama:/root/.llama \
|
||||
llamastack/distribution-meta-reference-quantized-gpu \
|
||||
--port $LLAMA_STACK_PORT \
|
||||
--env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct
|
||||
|
|
@ -68,6 +72,7 @@ If you are using Llama Stack Safety / Shield APIs, use:
|
|||
docker run \
|
||||
-it \
|
||||
-p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
|
||||
-v ~/.llama:/root/.llama \
|
||||
llamastack/distribution-meta-reference-quantized-gpu \
|
||||
--port $LLAMA_STACK_PORT \
|
||||
--env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct \
|
||||
|
|
|
|||
|
|
@ -15,9 +15,12 @@ The `llamastack/distribution-ollama` distribution consists of the following prov
|
|||
| API | Provider(s) |
|
||||
|-----|-------------|
|
||||
| agents | `inline::meta-reference` |
|
||||
| datasetio | `remote::huggingface`, `inline::localfs` |
|
||||
| eval | `inline::meta-reference` |
|
||||
| inference | `remote::ollama` |
|
||||
| memory | `inline::faiss`, `remote::chromadb`, `remote::pgvector` |
|
||||
| safety | `inline::llama-guard` |
|
||||
| scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` |
|
||||
| telemetry | `inline::meta-reference` |
|
||||
|
||||
|
||||
|
|
@ -118,9 +121,9 @@ llama stack run ./run-with-safety.yaml \
|
|||
|
||||
### (Optional) Update Model Serving Configuration
|
||||
|
||||
> [!NOTE]
|
||||
> Please check the [OLLAMA_SUPPORTED_MODELS](https://github.com/meta-llama/llama-stack/blob/main/llama_stack/providers.remote/inference/ollama/ollama.py) for the supported Ollama models.
|
||||
|
||||
```{note}
|
||||
Please check the [model_aliases](https://github.com/meta-llama/llama-stack/blob/main/llama_stack/providers/remote/inference/ollama/ollama.py#L45) for the supported Ollama models.
|
||||
```
|
||||
|
||||
To serve a new model with `ollama`
|
||||
```bash
|
||||
|
|
|
|||
|
|
@ -16,9 +16,12 @@ The `llamastack/distribution-tgi` distribution consists of the following provide
|
|||
| API | Provider(s) |
|
||||
|-----|-------------|
|
||||
| agents | `inline::meta-reference` |
|
||||
| datasetio | `remote::huggingface`, `inline::localfs` |
|
||||
| eval | `inline::meta-reference` |
|
||||
| inference | `remote::tgi` |
|
||||
| memory | `inline::faiss`, `remote::chromadb`, `remote::pgvector` |
|
||||
| safety | `inline::llama-guard` |
|
||||
| scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` |
|
||||
| telemetry | `inline::meta-reference` |
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -15,9 +15,12 @@ The `llamastack/distribution-together` distribution consists of the following pr
|
|||
| API | Provider(s) |
|
||||
|-----|-------------|
|
||||
| agents | `inline::meta-reference` |
|
||||
| datasetio | `remote::huggingface`, `inline::localfs` |
|
||||
| eval | `inline::meta-reference` |
|
||||
| inference | `remote::together` |
|
||||
| memory | `inline::faiss`, `remote::chromadb`, `remote::pgvector` |
|
||||
| safety | `inline::llama-guard` |
|
||||
| scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` |
|
||||
| telemetry | `inline::meta-reference` |
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -19,16 +19,17 @@ export LLAMA_STACK_PORT=5001
|
|||
ollama run $OLLAMA_INFERENCE_MODEL --keepalive 60m
|
||||
```
|
||||
|
||||
By default, Ollama keeps the model loaded in memory for 5 minutes which can be too short. We set the `--keepalive` flag to 60 minutes to enspagents/agenure the model remains loaded for sometime.
|
||||
By default, Ollama keeps the model loaded in memory for 5 minutes which can be too short. We set the `--keepalive` flag to 60 minutes to ensure the model remains loaded for sometime.
|
||||
|
||||
|
||||
### 2. Start the Llama Stack server
|
||||
|
||||
Llama Stack is based on a client-server architecture. It consists of a server which can be configured very flexibly so you can mix-and-match various providers for its individual API components -- beyond Inference, these include Memory, Agents, Telemetry, Evals and so forth.
|
||||
|
||||
To get started quickly, we provide various Docker images for the server component that work with different inference providers out of the box. For this guide, we will use `llamastack/distribution-ollama` as the Docker image.
|
||||
|
||||
```bash
|
||||
docker run \
|
||||
-it \
|
||||
docker run -it \
|
||||
-p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
|
||||
-v ~/.llama:/root/.llama \
|
||||
llamastack/distribution-ollama \
|
||||
|
|
@ -42,8 +43,7 @@ Configuration for this is available at `distributions/ollama/run.yaml`.
|
|||
|
||||
### 3. Use the Llama Stack client SDK
|
||||
|
||||
You can interact with the Llama Stack server using the `llama-stack-client` CLI or via the Python SDK.
|
||||
|
||||
You can interact with the Llama Stack server using various client SDKs. We will use the Python SDK which you can install using:
|
||||
```bash
|
||||
pip install llama-stack-client
|
||||
```
|
||||
|
|
@ -62,7 +62,7 @@ llama-stack-client --endpoint http://localhost:$LLAMA_STACK_PORT models list
|
|||
You can test basic Llama inference completion using the CLI too.
|
||||
```bash
|
||||
llama-stack-client --endpoint http://localhost:$LLAMA_STACK_PORT \
|
||||
inference chat_completion \
|
||||
inference chat-completion \
|
||||
--message "hello, what model are you?"
|
||||
```
|
||||
|
||||
|
|
@ -118,11 +118,11 @@ async def run_main():
|
|||
model=os.environ["INFERENCE_MODEL"],
|
||||
instructions="You are a helpful assistant",
|
||||
tools=[{"type": "memory"}], # enable Memory aka RAG
|
||||
enable_session_persistence=True,
|
||||
)
|
||||
|
||||
agent = Agent(client, agent_config)
|
||||
session_id = agent.create_session("test-session")
|
||||
print(f"Created session_id={session_id} for Agent({agent.agent_id})")
|
||||
user_prompts = [
|
||||
(
|
||||
"I am attaching documentation for Torchtune. Help me answer questions I will ask next.",
|
||||
|
|
@ -139,7 +139,7 @@ async def run_main():
|
|||
attachments=attachments,
|
||||
session_id=session_id,
|
||||
)
|
||||
async for log in EventLogger().log(response):
|
||||
for log in EventLogger().log(response):
|
||||
log.print()
|
||||
|
||||
|
||||
|
|
@ -153,3 +153,10 @@ if __name__ == "__main__":
|
|||
- Learn how to [Build Llama Stacks](../distributions/index.md)
|
||||
- See [References](../references/index.md) for more details about the llama CLI and Python SDK
|
||||
- For example applications and more detailed tutorials, visit our [llama-stack-apps](https://github.com/meta-llama/llama-stack-apps/tree/main/examples) repository.
|
||||
|
||||
|
||||
## Thinking out loud here in terms of what to write in the docs
|
||||
|
||||
- how to get a llama stack server running
|
||||
- what are all the different client sdks
|
||||
- what are the components of building agents
|
||||
|
|
|
|||
|
|
@ -13,38 +13,32 @@ Our goal is to provide pre-packaged implementations which can be operated in a v
|
|||
The Stack APIs are rapidly improving but still a work-in-progress. We invite feedback as well as direct contributions.
|
||||
```
|
||||
|
||||
## Philosophy
|
||||
## Quick Links
|
||||
|
||||
### Service-oriented design
|
||||
- New to Llama Stack? Start with the [Introduction](introduction/index) to understand our motivation and vision.
|
||||
- Ready to build? Check out the [Quick Start](getting_started/index) to get started.
|
||||
- Need specific providers? Browse [Distributions](distributions/index) to see all the options available.
|
||||
- Want to contribute? See the [Contributing](contributing/index) guide.
|
||||
|
||||
Unlike other frameworks, Llama Stack is built with a service-oriented, REST API-first approach. Such a design not only allows for seamless transitions from a local to remote deployments, but also forces the design to be more declarative. We believe this restriction can result in a much simpler, robust developer experience. This will necessarily trade-off against expressivity however if we get the APIs right, it can lead to a very powerful platform.
|
||||
## Available SDKs
|
||||
|
||||
### Composability
|
||||
|
||||
We expect the set of APIs we design to be composable. An Agent abstractly depends on { Inference, Memory, Safety } APIs but does not care about the actual implementation details. Safety itself may require model inference and hence can depend on the Inference API.
|
||||
|
||||
### Turnkey one-stop solutions
|
||||
|
||||
We expect to provide turnkey solutions for popular deployment scenarios. It should be easy to deploy a Llama Stack server on AWS or on a private data center. Either of these should allow a developer to get started with powerful agentic apps, model evaluations or fine-tuning services in a matter of minutes. They should all result in the same uniform observability and developer experience.
|
||||
|
||||
### Focus on Llama models
|
||||
|
||||
As a Meta initiated project, we have started by explicitly focusing on Meta's Llama series of models. Supporting the broad set of open models is no easy task and we want to start with models we understand best.
|
||||
|
||||
### Supporting the Ecosystem
|
||||
|
||||
There is a vibrant ecosystem of Providers which provide efficient inference or scalable vector stores or powerful observability solutions. We want to make sure it is easy for developers to pick and choose the best implementations for their use cases. We also want to make sure it is easy for new Providers to onboard and participate in the ecosystem.
|
||||
|
||||
Additionally, we have designed every element of the Stack such that APIs as well as Resources (like Models) can be federated.
|
||||
We have a number of client-side SDKs available for different languages.
|
||||
|
||||
| **Language** | **Client SDK** | **Package** |
|
||||
| :----: | :----: | :----: |
|
||||
| Python | [llama-stack-client-python](https://github.com/meta-llama/llama-stack-client-python) | [](https://pypi.org/project/llama_stack_client/)
|
||||
| Swift | [llama-stack-client-swift](https://github.com/meta-llama/llama-stack-client-swift) | [](https://swiftpackageindex.com/meta-llama/llama-stack-client-swift)
|
||||
| Node | [llama-stack-client-node](https://github.com/meta-llama/llama-stack-client-node) | [](https://npmjs.org/package/llama-stack-client)
|
||||
| Kotlin | [llama-stack-client-kotlin](https://github.com/meta-llama/llama-stack-client-kotlin) | [](https://central.sonatype.com/artifact/com.llama.llamastack/llama-stack-client-kotlin)
|
||||
|
||||
## Supported Llama Stack Implementations
|
||||
|
||||
Llama Stack already has a number of "adapters" available for some popular Inference and Memory (Vector Store) providers. For other APIs (particularly Safety and Agents), we provide *reference implementations* you can use to get started. We expect this list to grow over time. We are slowly onboarding more providers to the ecosystem as we get more confidence in the APIs.
|
||||
A number of "adapters" are available for some popular Inference and Memory (Vector Store) providers. For other APIs (particularly Safety and Agents), we provide *reference implementations* you can use to get started. We expect this list to grow over time. We are slowly onboarding more providers to the ecosystem as we get more confidence in the APIs.
|
||||
|
||||
| **API Provider** | **Environments** | **Agents** | **Inference** | **Memory** | **Safety** | **Telemetry** |
|
||||
| :----: | :----: | :----: | :----: | :----: | :----: | :----: |
|
||||
| Meta Reference | Single Node | Y | Y | Y | Y | Y |
|
||||
| Cerebras | Single Node | | Y | | | |
|
||||
| Fireworks | Hosted | Y | Y | Y | | |
|
||||
| AWS Bedrock | Hosted | | Y | | Y | |
|
||||
| Together | Hosted | Y | Y | | Y | |
|
||||
|
|
@ -54,29 +48,13 @@ Llama Stack already has a number of "adapters" available for some popular Infere
|
|||
| Chroma | Single Node | | | Y | | |
|
||||
| Postgres | Single Node | | | Y | | |
|
||||
| PyTorch ExecuTorch | On-device iOS | Y | Y | | | |
|
||||
|
||||
## Dive In
|
||||
|
||||
- Look at [Quick Start](getting_started/index) section to get started with Llama Stack.
|
||||
- Learn more about [Llama Stack Concepts](concepts/index) to understand how different components fit together.
|
||||
- Check out [Zero to Hero](https://github.com/meta-llama/llama-stack/tree/main/docs/zero_to_hero_guide) guide to learn in details about how to build your first agent.
|
||||
- See how you can use [Llama Stack Distributions](distributions/index) to get started with popular inference and other service providers.
|
||||
|
||||
We also provide a number of Client side SDKs to make it easier to connect to Llama Stack server in your preferred language.
|
||||
|
||||
| **Language** | **Client SDK** | **Package** |
|
||||
| :----: | :----: | :----: |
|
||||
| Python | [llama-stack-client-python](https://github.com/meta-llama/llama-stack-client-python) | [](https://pypi.org/project/llama_stack_client/)
|
||||
| Swift | [llama-stack-client-swift](https://github.com/meta-llama/llama-stack-client-swift) | [](https://swiftpackageindex.com/meta-llama/llama-stack-client-swift)
|
||||
| Node | [llama-stack-client-node](https://github.com/meta-llama/llama-stack-client-node) | [](https://npmjs.org/package/llama-stack-client)
|
||||
| Kotlin | [llama-stack-client-kotlin](https://github.com/meta-llama/llama-stack-client-kotlin) | [](https://central.sonatype.com/artifact/com.llama.llamastack/llama-stack-client-kotlin)
|
||||
|
||||
You can find more example scripts with client SDKs to talk with the Llama Stack server in our [llama-stack-apps](https://github.com/meta-llama/llama-stack-apps/tree/main/examples) repo.
|
||||
| PyTorch ExecuTorch | On-device Android | | Y | | | |
|
||||
|
||||
```{toctree}
|
||||
:hidden:
|
||||
:maxdepth: 3
|
||||
|
||||
introduction/index
|
||||
getting_started/index
|
||||
concepts/index
|
||||
distributions/index
|
||||
|
|
|
|||
95
docs/source/introduction/index.md
Normal file
|
|
@ -0,0 +1,95 @@
|
|||
# Why Llama Stack?

Building production AI applications today requires solving multiple challenges:

**Infrastructure Complexity**
- Running large language models efficiently requires specialized infrastructure.
- Different deployment scenarios (local development, cloud, edge) need different solutions.
- Moving from development to production often requires significant rework.

**Essential Capabilities**
- Safety guardrails and content filtering are necessary in an enterprise setting.
- Model inference alone is not enough: knowledge retrieval and RAG capabilities are also required.
- Nearly any application needs composable multi-step workflows.
- Finally, without monitoring, observability, and evaluation, you end up operating in the dark.

**Lack of Flexibility and Choice**
- Directly integrating with multiple providers creates tight coupling.
- Different providers have different APIs and abstractions.
- Changing providers requires significant code changes.

### The Vision: A Universal Stack

```{image} ../../_static/llama-stack.png
:alt: Llama Stack
:width: 400px
```

Llama Stack defines and standardizes the core building blocks needed to bring generative AI applications to market. These building blocks are presented as interoperable APIs with a broad set of Service Providers providing their implementations.

#### Service-oriented Design

Unlike other frameworks, Llama Stack is built with a service-oriented, REST API-first approach. Such a design not only allows for seamless transitions from local to remote deployments but also forces the design to be more declarative. This restriction can result in a much simpler, more robust developer experience. The same code works across different environments (see the sketch after this list):

- Local development with CPU-only setups
- Self-hosted with GPU acceleration
- Cloud-hosted on providers like AWS, Fireworks, Together
- On-device for iOS and Android
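
As a small illustration of the REST-first idea, the sketch below reuses the chat-completion endpoint from the quickstart later in this guide; the base URLs are placeholders, and the exact path and payload may differ between releases. The same request works whether the server is local, self-hosted, or cloud-hosted:

```bash
# Hypothetical base URLs - point BASE_URL at whichever deployment you are using.
BASE_URL="http://localhost:5051"                  # local development
# BASE_URL="https://my-llama-stack.example.com"   # self-hosted or cloud deployment

# The request itself does not change across environments.
curl $BASE_URL/inference/chat_completion \
  -H "Content-Type: application/json" \
  -d '{
    "model": "Llama3.2-3B-Instruct",
    "messages": [{"role": "user", "content": "Hello!"}]
  }'
```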
#### Composability

The APIs we design are composable. An Agent abstractly depends on the { Inference, Memory, Safety } APIs but does not care about the actual implementation details. Safety itself may require model inference and hence can depend on the Inference API.

#### Turnkey Solutions

We provide turnkey solutions for popular deployment scenarios. It should be easy to deploy a Llama Stack server on AWS or in a private data center. Either of these should let a developer get started with powerful agentic apps, model evaluations, or fine-tuning services in minutes.

We have built-in support for critical needs:

- Safety guardrails and content filtering
- Comprehensive evaluation capabilities
- Full observability and monitoring
- Provider federation and fallback

#### Focus on Llama Models

As a Meta-initiated project, we explicitly focus on Meta's Llama series of models. Supporting the broad set of open models is no easy task and we want to start with the models we understand best.

#### Supporting the Ecosystem

There is a vibrant ecosystem of Providers offering efficient inference, scalable vector stores, and powerful observability solutions. We want to make sure it is easy for developers to pick and choose the best implementations for their use cases. We also want to make sure it is easy for new Providers to onboard and participate in the ecosystem.

Additionally, we have designed every element of the Stack such that APIs as well as Resources (like Models) can be federated.

#### Rich Provider Ecosystem
```{list-table}
:header-rows: 1

* - Provider
  - Local
  - Self-hosted
  - Cloud
* - Inference
  - Ollama
  - vLLM, TGI
  - Fireworks, Together, AWS
* - Memory
  - FAISS
  - Chroma, pgvector
  - Weaviate
* - Safety
  - Llama Guard
  - -
  - AWS Bedrock
```

### Unified API Layer

Llama Stack provides a consistent interface for:

- **Inference**: Run LLMs efficiently
- **Safety**: Apply content filtering and safety policies
- **Memory**: Store and retrieve knowledge for RAG
- **Agents**: Build multi-step workflows
- **Evaluation**: Test and improve application quality

@@ -27,8 +27,6 @@ $ llama-stack-client configure

Done! You can now use the Llama Stack Client CLI with endpoint http://localhost:5000
```

## Provider Commands

### `llama-stack-client providers list`
```bash
$ llama-stack-client providers list
@@ -119,8 +117,25 @@ $ llama-stack-client memory_banks list

+--------------+----------------+--------+-------------------+------------------------+--------------------------+
```

### `llama-stack-client memory_banks register`
```bash
$ llama-stack-client memory_banks register <memory-bank-id> --type <type> [--provider-id <provider-id>] [--provider-memory-bank-id <provider-memory-bank-id>] [--chunk-size <chunk-size>] [--embedding-model <embedding-model>] [--overlap-size <overlap-size>]
```

Options:
- `--type`: Required. Type of memory bank. Choices: "vector", "keyvalue", "keyword", "graph"
- `--provider-id`: Optional. Provider ID for the memory bank
- `--provider-memory-bank-id`: Optional. Provider's memory bank ID
- `--chunk-size`: Optional. Chunk size in tokens (for vector type). Default: 512
- `--embedding-model`: Optional. Embedding model (for vector type). Default: "all-MiniLM-L6-v2"
- `--overlap-size`: Optional. Overlap size in tokens (for vector type). Default: 64
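
For example, a vector memory bank for RAG could be registered as follows (the bank ID and provider ID below are illustrative placeholders):

```bash
$ llama-stack-client memory_banks register my_documents \
    --type vector \
    --provider-id chromadb \
    --chunk-size 512 \
    --overlap-size 64 \
    --embedding-model all-MiniLM-L6-v2
```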
### `llama-stack-client memory_banks unregister`
```bash
$ llama-stack-client memory_banks unregister <memory-bank-id>
```

## Shield Management

### `llama-stack-client shields list`
```bash
$ llama-stack-client shields list

@@ -134,16 +149,51 @@ $ llama-stack-client shields list

+--------------+----------+----------------+-------------+
```

### `llama-stack-client shields register`
```bash
$ llama-stack-client shields register --shield-id <shield-id> [--provider-id <provider-id>] [--provider-shield-id <provider-shield-id>] [--params <params>]
```

Options:
- `--shield-id`: Required. ID of the shield
- `--provider-id`: Optional. Provider ID for the shield
- `--provider-shield-id`: Optional. Provider's shield ID
- `--params`: Optional. JSON configuration parameters for the shield
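
For example, to register the Llama Guard shield referenced elsewhere in these docs (the shield ID shown is a placeholder and depends on your configured safety provider):

```bash
$ llama-stack-client shields register --shield-id llama_guard
```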
## Eval Task Management

### `llama-stack-client eval_tasks list`
```bash
$ llama-stack-client eval_tasks list
```

### `llama-stack-client eval_tasks register`
```bash
$ llama-stack-client eval_tasks register --eval-task-id <eval-task-id> --dataset-id <dataset-id> --scoring-functions <function1> [<function2> ...] [--provider-id <provider-id>] [--provider-eval-task-id <provider-eval-task-id>] [--metadata <metadata>]
```

Options:
- `--eval-task-id`: Required. ID of the eval task
- `--dataset-id`: Required. ID of the dataset to evaluate
- `--scoring-functions`: Required. One or more scoring functions to use for evaluation
- `--provider-id`: Optional. Provider ID for the eval task
- `--provider-eval-task-id`: Optional. Provider's eval task ID
- `--metadata`: Optional. Metadata for the eval task in JSON format
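
For example (the task ID, dataset ID, and scoring function names below are illustrative placeholders):

```bash
$ llama-stack-client eval_tasks register \
    --eval-task-id my-eval-task \
    --dataset-id my-dataset \
    --scoring-functions accuracy exact_match \
    --metadata '{"description": "smoke test"}'
```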
## Eval Execution

### `llama-stack-client eval run-benchmark`
```bash
$ llama-stack-client eval run-benchmark <eval-task-id1> [<eval-task-id2> ...] --eval-task-config <config-file> --output-dir <output-dir> [--num-examples <num>] [--visualize]
```

Options:
- `--eval-task-config`: Required. Path to the eval task config file in JSON format
- `--output-dir`: Required. Path to the directory where evaluation results will be saved
- `--num-examples`: Optional. Number of examples to evaluate (useful for debugging)
- `--visualize`: Optional flag. If set, visualizes evaluation results after completion
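
For example, using the placeholder task ID registered above together with the config file shown below:

```bash
$ llama-stack-client eval run-benchmark my-eval-task \
    --eval-task-config ~/eval_task_config.json \
    --output-dir ./eval_results \
    --num-examples 10 \
    --visualize
```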

Example eval_task_config.json:
```json
{
    "type": "benchmark",
    "eval_candidate": {

@@ -160,3 +210,14 @@ $ cat ~/eval_task_config.json

    }
}
```

### `llama-stack-client eval run-scoring`
```bash
$ llama-stack-client eval run-scoring <eval-task-id> --eval-task-config <config-file> --output-dir <output-dir> [--num-examples <num>] [--visualize]
```

Options:
- `--eval-task-config`: Required. Path to the eval task config file in JSON format
- `--output-dir`: Required. Path to the directory where scoring results will be saved
- `--num-examples`: Optional. Number of examples to evaluate (useful for debugging)
- `--visualize`: Optional flag. If set, visualizes scoring results after completion
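
For example, again with the placeholder task ID and config file from above:

```bash
$ llama-stack-client eval run-scoring my-eval-task \
    --eval-task-config ~/eval_task_config.json \
    --output-dir ./scoring_results
```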

@@ -13,13 +13,13 @@ Based on your developer needs, below are references to guides to help you get st
* Developer Need: I want to start a local Llama Stack server with my GPU using meta-reference implementations.
* Effort: 5min
* Guide:
  - Please see our [meta-reference-gpu](https://llama-stack.readthedocs.io/en/latest/distributions/self_hosted_distro/meta-reference-gpu.html) guide on starting up a meta-reference Llama Stack server.

### Llama Stack Server with Remote Providers
* Developer Need: I want a Llama Stack distribution with a remote provider.
* Effort: 10min
* Guide:
  - Please see our [Distributions Guide](https://llama-stack.readthedocs.io/en/latest/concepts/index.html#distributions) on starting up distributions with remote providers.

### On-Device (iOS) Llama Stack
@@ -38,4 +38,4 @@ Based on your developer needs, below are references to guides to help you get st

* Developer Need: I want to add a new API provider to Llama Stack.
* Effort: 3hr
* Guide:
  - Please see our [Adding a New API Provider](https://llama-stack.readthedocs.io/en/latest/contributing/new_api_provider.html) guide for adding a new API provider.

@@ -231,7 +231,7 @@

"source": [
"Thanks for checking out this notebook! \n",
"\n",
"The next one will be a guide on [Prompt Engineering](./02_Prompt_Engineering101.ipynb), please continue learning!"
]
}
],
@@ -276,7 +276,7 @@

"source": [
"Thanks for checking out this notebook! \n",
"\n",
"The next one will be a guide on how to chat with images, continue to the notebook [here](./03_Image_Chat101.ipynb). Happy learning!"
]
}
],
@@ -175,7 +175,7 @@

"source": [
"Thanks for checking out this notebook! \n",
"\n",
"The next one in the series will teach you one of the favorite applications of Large Language Models: [Tool Calling](./04_Tool_Calling101.ipynb). Enjoy!"
]
}
],
@@ -286,6 +286,9 @@

"    input_shields = [] if disable_safety else [\"llama_guard\"]\n",
"    output_shields = [] if disable_safety else [\"llama_guard\"]\n",
"\n",
"    # Initialize custom tool (ensure `WebSearchTool` is defined earlier in the notebook)\n",
"    webSearchTool = WebSearchTool(api_key=BRAVE_SEARCH_API_KEY)\n",
"\n",
"    # Define the agent configuration, including the model and tool setup\n",
"    agent_config = AgentConfig(\n",
"        model=MODEL_NAME,\n",

@@ -296,18 +299,7 @@

"            \"top_p\": 0.9,\n",
"        },\n",
"        tools=[\n",
"            webSearchTool.get_tool_definition()\n",
"        ],\n",
"        tool_choice=\"auto\",\n",
"        tool_prompt_format=\"python_list\",\n",
@@ -316,11 +308,8 @@

"        enable_session_persistence=False,\n",
"    )\n",
"\n",
"    # Create an agent instance with the client and configuration\n",
"    agent = Agent(client, agent_config, [webSearchTool])\n",
"\n",
"    # Create a session for interaction and print the session ID\n",
"    session_id = agent.create_session(\"test-session\")\n",
@@ -373,7 +373,7 @@

"source": [
"Awesome, now we can embed all our notes with Llama-stack and ask it about the meaning of life :)\n",
"\n",
"Next up, we will learn about the safety features and how to use them: [notebook link](./06_Safety101.ipynb)."
]
}
],
@@ -107,7 +107,7 @@

"source": [
"Thanks for learning about the Safety API of Llama-Stack. \n",
"\n",
"Finally, we learn about the Agents API, [here](./07_Agents101.ipynb)."
]
}
],
@@ -1,37 +1,21 @@

# Llama Stack: from Zero to Hero

Llama Stack defines and standardizes the set of core building blocks needed to bring generative AI applications to market. These building blocks are presented in the form of interoperable APIs, with a broad set of Providers providing their implementations. These building blocks are assembled into Distributions, which make it easy for developers to go from zero to production.

This guide will walk you through an end-to-end workflow with Llama Stack, using Ollama as the inference provider and ChromaDB as the memory provider. Please note that the steps for configuring your provider and distribution will vary a little depending on the services you use. However, the user experience will remain universal - this is the power of Llama Stack.

If you're looking for more specific topics, we have a [Zero to Hero Guide](#next-steps) that covers everything from Tool Calling to Agents in detail. Feel free to skip to the end to explore the advanced topics you're interested in.

> If you'd prefer not to set up a local server, explore our notebook on [tool calling with the Together API](Tool_Calling101_Using_Together's_Llama_Stack_Server.ipynb). This notebook will show you how to leverage together.ai's Llama Stack Server API, allowing you to get started with Llama Stack without the need for a locally built and running server.

## Table of Contents
1. [Setup and run ollama](#setup-ollama)
2. [Install Dependencies and Set Up Environment](#install-dependencies-and-set-up-environment)
3. [Build, Configure, and Run Llama Stack](#build-configure-and-run-llama-stack)
4. [Test with llama-stack-client CLI](#test-with-llama-stack-client-cli)
5. [Test with curl](#test-with-curl)
6. [Test with Python](#test-with-python)
7. [Next Steps](#next-steps)

---
@@ -39,107 +23,137 @@ If you're looking for more specific topics like tool calling or agent setup, we

1. **Download Ollama App**:
   - Go to [https://ollama.com/download](https://ollama.com/download).
   - Follow the instructions for your OS. For example, if you are on a Mac, download and unzip `Ollama-darwin.zip`.
   - Run the `Ollama` application.

1. **Download the Ollama CLI**:
   Ensure you have the `ollama` command line tool by downloading and installing it from the same website.

1. **Start ollama server**:
   Open the terminal and run:
   ```
   ollama serve
   ```

1. **Run the model**:
   Open the terminal and run:
   ```bash
   ollama run llama3.2:3b-instruct-fp16 --keepalive -1m
   ```
   **Note**:
   - The supported models for Llama Stack are listed [here](https://github.com/meta-llama/llama-stack/blob/main/llama_stack/providers/remote/inference/ollama/ollama.py#L43).
   - `--keepalive -1m` keeps the model in memory indefinitely. Otherwise, ollama frees the memory and you would have to run `ollama run` again.

---
## Install Dependencies and Set Up Environment

1. **Create a Conda Environment**:
   Create a new Conda environment with Python 3.10:
   ```bash
   conda create -n ollama python=3.10
   ```
   Activate the environment:
   ```bash
   conda activate ollama
   ```

2. **Install ChromaDB**:
   Install `chromadb` using `pip`:
   ```bash
   pip install chromadb
   ```

3. **Run ChromaDB**:
   Start the ChromaDB server:
   ```bash
   chroma run --host localhost --port 8000 --path ./my_chroma_data
   ```

4. **Install Llama Stack**:
   Open a new terminal and install `llama-stack`:
   ```bash
   conda activate ollama
   pip install llama-stack==0.0.55
   ```

---
## Build, Configure, and Run Llama Stack

1. **Build the Llama Stack**:
   Build the Llama Stack using the `ollama` template:
   ```bash
   llama stack build --template ollama --image-type conda
   ```
   **Expected Output:**
   ```
   ...
   Build Successful! Next steps:
   1. Set the environment variables: LLAMASTACK_PORT, OLLAMA_URL, INFERENCE_MODEL, SAFETY_MODEL
   2. `llama stack run /Users/<username>/.llama/distributions/llamastack-ollama/ollama-run.yaml`
   ```

2. **Set the ENV variables by exporting them to the terminal**:
   ```bash
   export OLLAMA_URL="http://localhost:11434"
   export LLAMA_STACK_PORT=5051
   export INFERENCE_MODEL="meta-llama/Llama-3.2-3B-Instruct"
   export SAFETY_MODEL="meta-llama/Llama-Guard-3-1B"
   ```

3. **Run the Llama Stack**:
   Run the stack with the command printed by the build step above:
   ```bash
   llama stack run ollama \
      --port $LLAMA_STACK_PORT \
      --env INFERENCE_MODEL=$INFERENCE_MODEL \
      --env SAFETY_MODEL=$SAFETY_MODEL \
      --env OLLAMA_URL=$OLLAMA_URL
   ```
   Note: Every time you run a new model with `ollama run`, you will need to restart the Llama Stack server. Otherwise it won't see the new model.

The server will start and listen on `http://localhost:5051`.

---
## Test with `llama-stack-client` CLI

After setting up the server, open a new terminal window and install the llama-stack-client package.

1. Install the llama-stack-client package
   ```bash
   conda activate ollama
   pip install llama-stack-client
   ```
2. Configure the CLI to point to the llama-stack server.
   ```bash
   llama-stack-client configure --endpoint http://localhost:5051
   ```
   **Expected Output:**
   ```bash
   Done! You can now use the Llama Stack Client CLI with endpoint http://localhost:5051
   ```
3. Test the CLI by running inference:
   ```bash
   llama-stack-client inference chat-completion --message "Write me a 2-sentence poem about the moon"
   ```
   **Expected Output:**
   ```bash
   ChatCompletionResponse(
       completion_message=CompletionMessage(
           content='Here is a 2-sentence poem about the moon:\n\nSilver crescent shining bright in the night,\nA beacon of wonder, full of gentle light.',
           role='assistant',
           stop_reason='end_of_turn',
           tool_calls=[]
       ),
       logprobs=None
   )
   ```

## Test with `curl`

After setting up the server, open a new terminal window and verify it's working by sending a `POST` request using `curl`:

```bash
curl http://localhost:$LLAMA_STACK_PORT/inference/chat_completion \
-H "Content-Type: application/json" \
-d '{
  "model": "Llama3.2-3B-Instruct",
@@ -168,15 +182,16 @@ You can check the available models with the command `llama-stack-client models l

---

## Test with Python

You can also interact with the Llama Stack server using a simple Python script. Below is an example:

### 1. Activate Conda Environment and Install Required Python Packages
The `llama-stack-client` library offers robust and efficient Python methods for interacting with the Llama Stack server.

```bash
conda activate ollama
pip install llama-stack-client
```

Note: the client library is installed by default when you install the server library.
@@ -188,6 +203,8 @@ touch test_llama_stack.py

### 3. Create a Chat Completion Request in Python

In `test_llama_stack.py`, write the following code:

```python
from llama_stack_client import LlamaStackClient
@@ -227,15 +244,15 @@ This command initializes the model to interact with your local Llama Stack insta

## Next Steps

**Explore Other Guides**: Dive deeper into specific topics by following these guides:
- [Understanding Distribution](https://llama-stack.readthedocs.io/en/latest/concepts/index.html#distributions)
- [Inference 101](00_Inference101.ipynb)
- [Local and Cloud Model Toggling 101](01_Local_Cloud_Inference101.ipynb)
- [Prompt Engineering](02_Prompt_Engineering101.ipynb)
- [Chat with Image - LlamaStack Vision API](03_Image_Chat101.ipynb)
- [Tool Calling: How to and Details](04_Tool_Calling101.ipynb)
- [Memory API: Show Simple In-Memory Retrieval](05_Memory101.ipynb)
- [Using Safety API in Conversation](06_Safety101.ipynb)
- [Agents API: Explain Components](07_Agents101.ipynb)

**Explore Client SDKs**: Utilize our client SDKs for various languages to integrate Llama Stack into your applications:
@@ -244,7 +261,7 @@ This command initializes the model to interact with your local Llama Stack insta

- [Swift SDK](https://github.com/meta-llama/llama-stack-client-swift)
- [Kotlin SDK](https://github.com/meta-llama/llama-stack-client-kotlin)

**Advanced Configuration**: Learn how to customize your Llama Stack distribution by referring to the [Building a Llama Stack Distribution](https://llama-stack.readthedocs.io/en/latest/distributions/building_distro.html) guide.

**Explore Example Apps**: Check out [llama-stack-apps](https://github.com/meta-llama/llama-stack-apps/tree/main/examples) for example applications built using Llama Stack.
@@ -71,7 +71,8 @@

}
],
"source": [
"!pip install llama-stack-client==0.0.50\n",
"!pip install -U httpx==0.27.2 # https://github.com/meta-llama/llama-stack-apps/issues/131"
]
},
{
@@ -355,6 +356,9 @@

"async def create_weather_agent(client: LlamaStackClient) -> Agent:\n",
"    \"\"\"Create an agent with weather tool capability.\"\"\"\n",
"\n",
"    # Create the weather tool\n",
"    weather_tool = WeatherTool()\n",
"\n",
"    agent_config = AgentConfig(\n",
"        model=LLAMA31_8B_INSTRUCT,\n",
"        #model=model_name,\n",
@@ -369,23 +373,7 @@

"            \"top_p\": 0.9,\n",
"        },\n",
"        tools=[\n",
"            weather_tool.get_tool_definition()\n",
"        ],\n",
"        tool_choice=\"auto\",\n",
"        tool_prompt_format=\"json\",\n",
@@ -394,8 +382,6 @@

"        enable_session_persistence=True\n",
"    )\n",
"\n",
"    agent = Agent(\n",
"        client=client,\n",
"        agent_config=agent_config,\n",
@@ -470,5 +456,5 @@

}
},
"nbformat": 4,
"nbformat_minor": 4
}