mirror of
https://github.com/meta-llama/llama-stack.git
synced 2025-12-23 03:22:26 +00:00
Merge branch 'main' of https://github.com/anigasan/llama-stack
Integrating the Tavily Search functionality to be better integrated with the Llama 4 API
This commit is contained in:
commit
b8846f65e6
108 changed files with 4132 additions and 3288 deletions
24
docs/_static/llama-stack-spec.html
vendored
24
docs/_static/llama-stack-spec.html
vendored
|
|
@ -11340,6 +11340,9 @@
|
|||
},
|
||||
"embedding_dimension": {
|
||||
"type": "integer"
|
||||
},
|
||||
"vector_db_name": {
|
||||
"type": "string"
|
||||
}
|
||||
},
|
||||
"additionalProperties": false,
|
||||
|
|
@ -13590,10 +13593,6 @@
|
|||
"provider_id": {
|
||||
"type": "string",
|
||||
"description": "The ID of the provider to use for this vector store."
|
||||
},
|
||||
"provider_vector_db_id": {
|
||||
"type": "string",
|
||||
"description": "The provider-specific vector database ID."
|
||||
}
|
||||
},
|
||||
"additionalProperties": false,
|
||||
|
|
@ -14796,7 +14795,8 @@
|
|||
"description": "Template for formatting each retrieved chunk in the context. Available placeholders: {index} (1-based chunk ordinal), {chunk.content} (chunk content string), {metadata} (chunk metadata dict). Default: \"Result {index}\\nContent: {chunk.content}\\nMetadata: {metadata}\\n\""
|
||||
},
|
||||
"mode": {
|
||||
"type": "string",
|
||||
"$ref": "#/components/schemas/RAGSearchMode",
|
||||
"default": "vector",
|
||||
"description": "Search mode for retrieval—either \"vector\", \"keyword\", or \"hybrid\". Default \"vector\"."
|
||||
},
|
||||
"ranker": {
|
||||
|
|
@ -14831,6 +14831,16 @@
|
|||
}
|
||||
}
|
||||
},
|
||||
"RAGSearchMode": {
|
||||
"type": "string",
|
||||
"enum": [
|
||||
"vector",
|
||||
"keyword",
|
||||
"hybrid"
|
||||
],
|
||||
"title": "RAGSearchMode",
|
||||
"description": "Search modes for RAG query retrieval: - VECTOR: Uses vector similarity search for semantic matching - KEYWORD: Uses keyword-based search for exact matching - HYBRID: Combines both vector and keyword search for better results"
|
||||
},
|
||||
"RRFRanker": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
|
|
@ -15623,6 +15633,10 @@
|
|||
"type": "string",
|
||||
"description": "The identifier of the provider."
|
||||
},
|
||||
"vector_db_name": {
|
||||
"type": "string",
|
||||
"description": "The name of the vector database."
|
||||
},
|
||||
"provider_vector_db_id": {
|
||||
"type": "string",
|
||||
"description": "The identifier of the vector database in the provider."
|
||||
|
|
|
|||
23
docs/_static/llama-stack-spec.yaml
vendored
23
docs/_static/llama-stack-spec.yaml
vendored
|
|
@ -7984,6 +7984,8 @@ components:
|
|||
type: string
|
||||
embedding_dimension:
|
||||
type: integer
|
||||
vector_db_name:
|
||||
type: string
|
||||
additionalProperties: false
|
||||
required:
|
||||
- identifier
|
||||
|
|
@ -9494,10 +9496,6 @@ components:
|
|||
type: string
|
||||
description: >-
|
||||
The ID of the provider to use for this vector store.
|
||||
provider_vector_db_id:
|
||||
type: string
|
||||
description: >-
|
||||
The provider-specific vector database ID.
|
||||
additionalProperties: false
|
||||
required:
|
||||
- name
|
||||
|
|
@ -10346,7 +10344,8 @@ components:
|
|||
content string), {metadata} (chunk metadata dict). Default: "Result {index}\nContent:
|
||||
{chunk.content}\nMetadata: {metadata}\n"
|
||||
mode:
|
||||
type: string
|
||||
$ref: '#/components/schemas/RAGSearchMode'
|
||||
default: vector
|
||||
description: >-
|
||||
Search mode for retrieval—either "vector", "keyword", or "hybrid". Default
|
||||
"vector".
|
||||
|
|
@ -10373,6 +10372,17 @@ components:
|
|||
mapping:
|
||||
default: '#/components/schemas/DefaultRAGQueryGeneratorConfig'
|
||||
llm: '#/components/schemas/LLMRAGQueryGeneratorConfig'
|
||||
RAGSearchMode:
|
||||
type: string
|
||||
enum:
|
||||
- vector
|
||||
- keyword
|
||||
- hybrid
|
||||
title: RAGSearchMode
|
||||
description: >-
|
||||
Search modes for RAG query retrieval: - VECTOR: Uses vector similarity search
|
||||
for semantic matching - KEYWORD: Uses keyword-based search for exact matching
|
||||
- HYBRID: Combines both vector and keyword search for better results
|
||||
RRFRanker:
|
||||
type: object
|
||||
properties:
|
||||
|
|
@ -10933,6 +10943,9 @@ components:
|
|||
provider_id:
|
||||
type: string
|
||||
description: The identifier of the provider.
|
||||
vector_db_name:
|
||||
type: string
|
||||
description: The name of the vector database.
|
||||
provider_vector_db_id:
|
||||
type: string
|
||||
description: >-
|
||||
|
|
|
|||
6
docs/source/advanced_apis/eval/index.md
Normal file
6
docs/source/advanced_apis/eval/index.md
Normal file
|
|
@ -0,0 +1,6 @@
|
|||
# Eval Providers
|
||||
|
||||
This section contains documentation for all available providers for the **eval** API.
|
||||
|
||||
- [inline::meta-reference](inline_meta-reference.md)
|
||||
- [remote::nvidia](remote_nvidia.md)
|
||||
21
docs/source/advanced_apis/eval/inline_meta-reference.md
Normal file
21
docs/source/advanced_apis/eval/inline_meta-reference.md
Normal file
|
|
@ -0,0 +1,21 @@
|
|||
# inline::meta-reference
|
||||
|
||||
## Description
|
||||
|
||||
Meta's reference implementation of evaluation tasks with support for multiple languages and evaluation metrics.
|
||||
|
||||
## Configuration
|
||||
|
||||
| Field | Type | Required | Default | Description |
|
||||
|-------|------|----------|---------|-------------|
|
||||
| `kvstore` | `utils.kvstore.config.RedisKVStoreConfig \| utils.kvstore.config.SqliteKVStoreConfig \| utils.kvstore.config.PostgresKVStoreConfig \| utils.kvstore.config.MongoDBKVStoreConfig` | No | sqlite | |
|
||||
|
||||
## Sample Configuration
|
||||
|
||||
```yaml
|
||||
kvstore:
|
||||
type: sqlite
|
||||
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/dummy}/meta_reference_eval.db
|
||||
|
||||
```
|
||||
|
||||
19
docs/source/advanced_apis/eval/remote_nvidia.md
Normal file
19
docs/source/advanced_apis/eval/remote_nvidia.md
Normal file
|
|
@ -0,0 +1,19 @@
|
|||
# remote::nvidia
|
||||
|
||||
## Description
|
||||
|
||||
NVIDIA's evaluation provider for running evaluation tasks on NVIDIA's platform.
|
||||
|
||||
## Configuration
|
||||
|
||||
| Field | Type | Required | Default | Description |
|
||||
|-------|------|----------|---------|-------------|
|
||||
| `evaluator_url` | `<class 'str'>` | No | http://0.0.0.0:7331 | The url for accessing the evaluator service |
|
||||
|
||||
## Sample Configuration
|
||||
|
||||
```yaml
|
||||
evaluator_url: ${env.NVIDIA_EVALUATOR_URL:=http://localhost:7331}
|
||||
|
||||
```
|
||||
|
||||
33
docs/source/advanced_apis/index.md
Normal file
33
docs/source/advanced_apis/index.md
Normal file
|
|
@ -0,0 +1,33 @@
|
|||
# Advanced APIs
|
||||
|
||||
## Post-training
|
||||
Fine-tunes a model.
|
||||
|
||||
```{toctree}
|
||||
:maxdepth: 1
|
||||
|
||||
post_training/index
|
||||
```
|
||||
|
||||
## Eval
|
||||
Generates outputs (via Inference or Agents) and perform scoring.
|
||||
|
||||
```{toctree}
|
||||
:maxdepth: 1
|
||||
|
||||
eval/index
|
||||
```
|
||||
|
||||
```{include} evaluation_concepts.md
|
||||
:start-after: ## Evaluation Concepts
|
||||
```
|
||||
|
||||
## Scoring
|
||||
Evaluates the outputs of the system.
|
||||
|
||||
```{toctree}
|
||||
:maxdepth: 1
|
||||
|
||||
scoring/index
|
||||
```
|
||||
|
||||
7
docs/source/advanced_apis/post_training/index.md
Normal file
7
docs/source/advanced_apis/post_training/index.md
Normal file
|
|
@ -0,0 +1,7 @@
|
|||
# Post_Training Providers
|
||||
|
||||
This section contains documentation for all available providers for the **post_training** API.
|
||||
|
||||
- [inline::huggingface](inline_huggingface.md)
|
||||
- [inline::torchtune](inline_torchtune.md)
|
||||
- [remote::nvidia](remote_nvidia.md)
|
||||
|
|
@ -0,0 +1,33 @@
|
|||
# inline::huggingface
|
||||
|
||||
## Description
|
||||
|
||||
HuggingFace-based post-training provider for fine-tuning models using the HuggingFace ecosystem.
|
||||
|
||||
## Configuration
|
||||
|
||||
| Field | Type | Required | Default | Description |
|
||||
|-------|------|----------|---------|-------------|
|
||||
| `device` | `<class 'str'>` | No | cuda | |
|
||||
| `distributed_backend` | `Literal['fsdp', 'deepspeed'` | No | | |
|
||||
| `checkpoint_format` | `Literal['full_state', 'huggingface'` | No | huggingface | |
|
||||
| `chat_template` | `<class 'str'>` | No | |
|
||||
| `model_specific_config` | `<class 'dict'>` | No | {'trust_remote_code': True, 'attn_implementation': 'sdpa'} | |
|
||||
| `max_seq_length` | `<class 'int'>` | No | 2048 | |
|
||||
| `gradient_checkpointing` | `<class 'bool'>` | No | False | |
|
||||
| `save_total_limit` | `<class 'int'>` | No | 3 | |
|
||||
| `logging_steps` | `<class 'int'>` | No | 10 | |
|
||||
| `warmup_ratio` | `<class 'float'>` | No | 0.1 | |
|
||||
| `weight_decay` | `<class 'float'>` | No | 0.01 | |
|
||||
| `dataloader_num_workers` | `<class 'int'>` | No | 4 | |
|
||||
| `dataloader_pin_memory` | `<class 'bool'>` | No | True | |
|
||||
|
||||
## Sample Configuration
|
||||
|
||||
```yaml
|
||||
checkpoint_format: huggingface
|
||||
distributed_backend: null
|
||||
device: cpu
|
||||
|
||||
```
|
||||
|
||||
20
docs/source/advanced_apis/post_training/inline_torchtune.md
Normal file
20
docs/source/advanced_apis/post_training/inline_torchtune.md
Normal file
|
|
@ -0,0 +1,20 @@
|
|||
# inline::torchtune
|
||||
|
||||
## Description
|
||||
|
||||
TorchTune-based post-training provider for fine-tuning and optimizing models using Meta's TorchTune framework.
|
||||
|
||||
## Configuration
|
||||
|
||||
| Field | Type | Required | Default | Description |
|
||||
|-------|------|----------|---------|-------------|
|
||||
| `torch_seed` | `int \| None` | No | | |
|
||||
| `checkpoint_format` | `Literal['meta', 'huggingface'` | No | meta | |
|
||||
|
||||
## Sample Configuration
|
||||
|
||||
```yaml
|
||||
checkpoint_format: meta
|
||||
|
||||
```
|
||||
|
||||
28
docs/source/advanced_apis/post_training/remote_nvidia.md
Normal file
28
docs/source/advanced_apis/post_training/remote_nvidia.md
Normal file
|
|
@ -0,0 +1,28 @@
|
|||
# remote::nvidia
|
||||
|
||||
## Description
|
||||
|
||||
NVIDIA's post-training provider for fine-tuning models on NVIDIA's platform.
|
||||
|
||||
## Configuration
|
||||
|
||||
| Field | Type | Required | Default | Description |
|
||||
|-------|------|----------|---------|-------------|
|
||||
| `api_key` | `str \| None` | No | | The NVIDIA API key. |
|
||||
| `dataset_namespace` | `str \| None` | No | default | The NVIDIA dataset namespace. |
|
||||
| `project_id` | `str \| None` | No | test-example-model@v1 | The NVIDIA project ID. |
|
||||
| `customizer_url` | `str \| None` | No | | Base URL for the NeMo Customizer API |
|
||||
| `timeout` | `<class 'int'>` | No | 300 | Timeout for the NVIDIA Post Training API |
|
||||
| `max_retries` | `<class 'int'>` | No | 3 | Maximum number of retries for the NVIDIA Post Training API |
|
||||
| `output_model_dir` | `<class 'str'>` | No | test-example-model@v1 | Directory to save the output model |
|
||||
|
||||
## Sample Configuration
|
||||
|
||||
```yaml
|
||||
api_key: ${env.NVIDIA_API_KEY:=}
|
||||
dataset_namespace: ${env.NVIDIA_DATASET_NAMESPACE:=default}
|
||||
project_id: ${env.NVIDIA_PROJECT_ID:=test-project}
|
||||
customizer_url: ${env.NVIDIA_CUSTOMIZER_URL:=http://nemo.test}
|
||||
|
||||
```
|
||||
|
||||
7
docs/source/advanced_apis/scoring/index.md
Normal file
7
docs/source/advanced_apis/scoring/index.md
Normal file
|
|
@ -0,0 +1,7 @@
|
|||
# Scoring Providers
|
||||
|
||||
This section contains documentation for all available providers for the **scoring** API.
|
||||
|
||||
- [inline::basic](inline_basic.md)
|
||||
- [inline::braintrust](inline_braintrust.md)
|
||||
- [inline::llm-as-judge](inline_llm-as-judge.md)
|
||||
13
docs/source/advanced_apis/scoring/inline_basic.md
Normal file
13
docs/source/advanced_apis/scoring/inline_basic.md
Normal file
|
|
@ -0,0 +1,13 @@
|
|||
# inline::basic
|
||||
|
||||
## Description
|
||||
|
||||
Basic scoring provider for simple evaluation metrics and scoring functions.
|
||||
|
||||
## Sample Configuration
|
||||
|
||||
```yaml
|
||||
{}
|
||||
|
||||
```
|
||||
|
||||
19
docs/source/advanced_apis/scoring/inline_braintrust.md
Normal file
19
docs/source/advanced_apis/scoring/inline_braintrust.md
Normal file
|
|
@ -0,0 +1,19 @@
|
|||
# inline::braintrust
|
||||
|
||||
## Description
|
||||
|
||||
Braintrust scoring provider for evaluation and scoring using the Braintrust platform.
|
||||
|
||||
## Configuration
|
||||
|
||||
| Field | Type | Required | Default | Description |
|
||||
|-------|------|----------|---------|-------------|
|
||||
| `openai_api_key` | `str \| None` | No | | The OpenAI API Key |
|
||||
|
||||
## Sample Configuration
|
||||
|
||||
```yaml
|
||||
openai_api_key: ${env.OPENAI_API_KEY:=}
|
||||
|
||||
```
|
||||
|
||||
13
docs/source/advanced_apis/scoring/inline_llm-as-judge.md
Normal file
13
docs/source/advanced_apis/scoring/inline_llm-as-judge.md
Normal file
|
|
@ -0,0 +1,13 @@
|
|||
# inline::llm-as-judge
|
||||
|
||||
## Description
|
||||
|
||||
LLM-as-judge scoring provider that uses language models to evaluate and score responses.
|
||||
|
||||
## Sample Configuration
|
||||
|
||||
```yaml
|
||||
{}
|
||||
|
||||
```
|
||||
|
||||
|
|
@ -1,4 +1,4 @@
|
|||
# Building AI Applications (Examples)
|
||||
# AI Application Examples
|
||||
|
||||
Llama Stack provides all the building blocks needed to create sophisticated AI applications.
|
||||
|
||||
|
|
@ -27,4 +27,5 @@ tools
|
|||
evals
|
||||
telemetry
|
||||
safety
|
||||
```
|
||||
playground/index
|
||||
```
|
||||
|
|
@ -1,4 +1,4 @@
|
|||
# Llama Stack Playground
|
||||
## Llama Stack Playground
|
||||
|
||||
```{note}
|
||||
The Llama Stack Playground is currently experimental and subject to change. We welcome feedback and contributions to help improve it.
|
||||
|
|
@ -9,7 +9,7 @@ The Llama Stack Playground is an simple interface which aims to:
|
|||
- Demo **end-to-end** application code to help users get started to build their own applications
|
||||
- Provide an **UI** to help users inspect and understand Llama Stack API providers and resources
|
||||
|
||||
## Key Features
|
||||
### Key Features
|
||||
|
||||
#### Playground
|
||||
Interactive pages for users to play with and explore Llama Stack API capabilities.
|
||||
|
|
@ -90,7 +90,7 @@ Interactive pages for users to play with and explore Llama Stack API capabilitie
|
|||
- Under the hood, it uses Llama Stack's `/<resources>/list` API to get information about each resources.
|
||||
- Please visit [Core Concepts](https://llama-stack.readthedocs.io/en/latest/concepts/index.html) for more details about the resources.
|
||||
|
||||
## Starting the Llama Stack Playground
|
||||
### Starting the Llama Stack Playground
|
||||
|
||||
To start the Llama Stack Playground, run the following commands:
|
||||
|
||||
|
|
@ -1,31 +1,39 @@
|
|||
# Why Llama Stack?
|
||||
## Llama Stack architecture
|
||||
|
||||
Building production AI applications today requires solving multiple challenges:
|
||||
|
||||
**Infrastructure Complexity**
|
||||
- Running large language models efficiently requires specialized infrastructure.
|
||||
- Different deployment scenarios (local development, cloud, edge) need different solutions.
|
||||
- Moving from development to production often requires significant rework.
|
||||
|
||||
**Essential Capabilities**
|
||||
- Safety guardrails and content filtering are necessary in an enterprise setting.
|
||||
- Just model inference is not enough - Knowledge retrieval and RAG capabilities are required.
|
||||
- Nearly any application needs composable multi-step workflows.
|
||||
- Finally, without monitoring, observability and evaluation, you end up operating in the dark.
|
||||
|
||||
**Lack of Flexibility and Choice**
|
||||
- Directly integrating with multiple providers creates tight coupling.
|
||||
- Different providers have different APIs and abstractions.
|
||||
- Changing providers requires significant code changes.
|
||||
|
||||
|
||||
### Our Solution: A Universal Stack
|
||||
Llama Stack allows you to build different layers of distributions for your AI workloads using various SDKs and API providers.
|
||||
|
||||
```{image} ../../_static/llama-stack.png
|
||||
:alt: Llama Stack
|
||||
:width: 400px
|
||||
```
|
||||
|
||||
### Benefits of Llama stack
|
||||
|
||||
#### Current challenges in custom AI applications
|
||||
|
||||
Building production AI applications today requires solving multiple challenges:
|
||||
|
||||
**Infrastructure Complexity**
|
||||
|
||||
- Running large language models efficiently requires specialized infrastructure.
|
||||
- Different deployment scenarios (local development, cloud, edge) need different solutions.
|
||||
- Moving from development to production often requires significant rework.
|
||||
|
||||
**Essential Capabilities**
|
||||
|
||||
- Safety guardrails and content filtering are necessary in an enterprise setting.
|
||||
- Just model inference is not enough - Knowledge retrieval and RAG capabilities are required.
|
||||
- Nearly any application needs composable multi-step workflows.
|
||||
- Without monitoring, observability and evaluation, you end up operating in the dark.
|
||||
|
||||
**Lack of Flexibility and Choice**
|
||||
|
||||
- Directly integrating with multiple providers creates tight coupling.
|
||||
- Different providers have different APIs and abstractions.
|
||||
- Changing providers requires significant code changes.
|
||||
|
||||
#### Our Solution: A Universal Stack
|
||||
|
||||
Llama Stack addresses these challenges through a service-oriented, API-first approach:
|
||||
|
||||
**Develop Anywhere, Deploy Everywhere**
|
||||
|
|
@ -59,4 +67,4 @@ Llama Stack addresses these challenges through a service-oriented, API-first app
|
|||
- **Turnkey Solutions**: Easy to deploy built in solutions for popular deployment scenarios
|
||||
|
||||
|
||||
With Llama Stack, you can focus on building your application while we handle the infrastructure complexity, essential capabilities, and provider integrations.
|
||||
With Llama Stack, you can focus on building your application while we handle the infrastructure complexity, essential capabilities, and provider integrations.
|
||||
|
|
@ -2,6 +2,10 @@
|
|||
|
||||
Given Llama Stack's service-oriented philosophy, a few concepts and workflows arise which may not feel completely natural in the LLM landscape, especially if you are coming with a background in other frameworks.
|
||||
|
||||
```{include} architecture.md
|
||||
:start-after: ## Llama Stack architecture
|
||||
```
|
||||
|
||||
```{include} apis.md
|
||||
:start-after: ## APIs
|
||||
```
|
||||
|
|
@ -10,14 +14,10 @@ Given Llama Stack's service-oriented philosophy, a few concepts and workflows ar
|
|||
:start-after: ## API Providers
|
||||
```
|
||||
|
||||
```{include} resources.md
|
||||
:start-after: ## Resources
|
||||
```
|
||||
|
||||
```{include} distributions.md
|
||||
:start-after: ## Distributions
|
||||
```
|
||||
|
||||
```{include} evaluation_concepts.md
|
||||
:start-after: ## Evaluation Concepts
|
||||
```{include} resources.md
|
||||
:start-after: ## Resources
|
||||
```
|
||||
|
|
|
|||
|
|
@ -52,7 +52,18 @@ extensions = [
|
|||
"sphinxcontrib.redoc",
|
||||
"sphinxcontrib.mermaid",
|
||||
"sphinxcontrib.video",
|
||||
"sphinx_reredirects"
|
||||
]
|
||||
|
||||
redirects = {
|
||||
"providers/post_training/index": "../../advanced_apis/post_training/index.html",
|
||||
"providers/eval/index": "../../advanced_apis/eval/index.html",
|
||||
"providers/scoring/index": "../../advanced_apis/scoring/index.html",
|
||||
"playground/index": "../../building_applications/playground/index.html",
|
||||
"openai/index": "../../providers/index.html#openai-api-compatibility",
|
||||
"introduction/index": "../concepts/index.html#llama-stack-architecture"
|
||||
}
|
||||
|
||||
myst_enable_extensions = ["colon_fence"]
|
||||
|
||||
html_theme = "sphinx_rtd_theme"
|
||||
|
|
|
|||
4
docs/source/deploying/index.md
Normal file
4
docs/source/deploying/index.md
Normal file
|
|
@ -0,0 +1,4 @@
|
|||
# Deployment Examples
|
||||
|
||||
```{include} kubernetes_deployment.md
|
||||
```
|
||||
|
|
@ -1,4 +1,4 @@
|
|||
# Kubernetes Deployment Guide
|
||||
## Kubernetes Deployment Guide
|
||||
|
||||
Instead of starting the Llama Stack and vLLM servers locally. We can deploy them in a Kubernetes cluster.
|
||||
|
||||
|
|
@ -145,6 +145,10 @@ $ llama stack build --template starter
|
|||
...
|
||||
You can now edit ~/.llama/distributions/llamastack-starter/starter-run.yaml and run `llama stack run ~/.llama/distributions/llamastack-starter/starter-run.yaml`
|
||||
```
|
||||
|
||||
```{tip}
|
||||
The generated `run.yaml` file is a starting point for your configuration. For comprehensive guidance on customizing it for your specific needs, infrastructure, and deployment scenarios, see [Customizing Your run.yaml Configuration](customizing_run_yaml.md).
|
||||
```
|
||||
:::
|
||||
:::{tab-item} Building from Scratch
|
||||
|
||||
|
|
|
|||
|
|
@ -2,6 +2,10 @@
|
|||
|
||||
The Llama Stack runtime configuration is specified as a YAML file. Here is a simplified version of an example configuration file for the Ollama distribution:
|
||||
|
||||
```{note}
|
||||
The default `run.yaml` files generated by templates are starting points for your configuration. For guidance on customizing these files for your specific needs, see [Customizing Your run.yaml Configuration](customizing_run_yaml.md).
|
||||
```
|
||||
|
||||
```{dropdown} 👋 Click here for a Sample Configuration File
|
||||
|
||||
```yaml
|
||||
|
|
|
|||
40
docs/source/distributions/customizing_run_yaml.md
Normal file
40
docs/source/distributions/customizing_run_yaml.md
Normal file
|
|
@ -0,0 +1,40 @@
|
|||
# Customizing run.yaml Files
|
||||
|
||||
The `run.yaml` files generated by Llama Stack templates are **starting points** designed to be customized for your specific needs. They are not meant to be used as-is in production environments.
|
||||
|
||||
## Key Points
|
||||
|
||||
- **Templates are starting points**: Generated `run.yaml` files contain defaults for development/testing
|
||||
- **Customization expected**: Update URLs, credentials, models, and settings for your environment
|
||||
- **Version control separately**: Keep customized configs in your own repository
|
||||
- **Environment-specific**: Create different configurations for dev, staging, production
|
||||
|
||||
## What You Can Customize
|
||||
|
||||
You can customize:
|
||||
- **Provider endpoints**: Change `http://localhost:8000` to your actual servers
|
||||
- **Swap providers**: Replace default providers (e.g., swap Tavily with Brave for search)
|
||||
- **Storage paths**: Move from `/tmp/` to production directories
|
||||
- **Authentication**: Add API keys, SSL, timeouts
|
||||
- **Models**: Different model sizes for dev vs prod
|
||||
- **Database settings**: Switch from SQLite to PostgreSQL
|
||||
- **Tool configurations**: Add custom tools and integrations
|
||||
|
||||
## Best Practices
|
||||
|
||||
- Use environment variables for secrets and environment-specific values
|
||||
- Create separate `run.yaml` files for different environments (dev, staging, prod)
|
||||
- Document your changes with comments
|
||||
- Test configurations before deployment
|
||||
- Keep your customized configs in version control
|
||||
|
||||
Example structure:
|
||||
```
|
||||
your-project/
|
||||
├── configs/
|
||||
│ ├── dev-run.yaml
|
||||
│ ├── prod-run.yaml
|
||||
└── README.md
|
||||
```
|
||||
|
||||
The goal is to take the generated template and adapt it to your specific infrastructure and operational needs.
|
||||
|
|
@ -6,13 +6,9 @@ This section provides an overview of the distributions available in Llama Stack.
|
|||
|
||||
```{toctree}
|
||||
:maxdepth: 3
|
||||
|
||||
list_of_distributions
|
||||
building_distro
|
||||
customizing_run_yaml
|
||||
importing_as_library
|
||||
configuration
|
||||
list_of_distributions
|
||||
kubernetes_deployment
|
||||
building_distro
|
||||
on_device_distro
|
||||
remote_hosted_distro
|
||||
self_hosted_distro
|
||||
```
|
||||
|
|
|
|||
|
|
@ -28,5 +28,4 @@ If you have built a container image and want to deploy it in a Kubernetes cluste
|
|||
|
||||
importing_as_library
|
||||
configuration
|
||||
kubernetes_deployment
|
||||
```
|
||||
|
|
|
|||
|
|
@ -1,4 +1,4 @@
|
|||
# Detailed Tutorial
|
||||
## Detailed Tutorial
|
||||
|
||||
In this guide, we'll walk through how you can use the Llama Stack (server and client SDK) to test a simple agent.
|
||||
A Llama Stack agent is a simple integrated system that can perform tasks by combining a Llama model for reasoning with
|
||||
|
|
@ -10,7 +10,7 @@ Llama Stack is a stateful service with REST APIs to support seamless transition
|
|||
In this guide, we'll walk through how to build a RAG agent locally using Llama Stack with [Ollama](https://ollama.com/)
|
||||
as the inference [provider](../providers/index.md#inference) for a Llama Model.
|
||||
|
||||
## Step 1: Installation and Setup
|
||||
### Step 1: Installation and Setup
|
||||
|
||||
Install Ollama by following the instructions on the [Ollama website](https://ollama.com/download), then
|
||||
download Llama 3.2 3B model, and then start the Ollama service.
|
||||
|
|
@ -45,7 +45,7 @@ Setup your virtual environment.
|
|||
uv sync --python 3.12
|
||||
source .venv/bin/activate
|
||||
```
|
||||
## Step 2: Run Llama Stack
|
||||
### Step 2: Run Llama Stack
|
||||
Llama Stack is a server that exposes multiple APIs, you connect with it using the Llama Stack client SDK.
|
||||
|
||||
::::{tab-set}
|
||||
|
|
@ -54,7 +54,7 @@ Llama Stack is a server that exposes multiple APIs, you connect with it using th
|
|||
You can use Python to build and run the Llama Stack server, which is useful for testing and development.
|
||||
|
||||
Llama Stack uses a [YAML configuration file](../distributions/configuration.md) to specify the stack setup,
|
||||
which defines the providers and their settings.
|
||||
which defines the providers and their settings. The generated configuration serves as a starting point that you can [customize for your specific needs](../distributions/customizing_run_yaml.md).
|
||||
Now let's build and run the Llama Stack config for Ollama.
|
||||
We use `starter` as template. By default all providers are disabled, this requires enable ollama by passing environment variables.
|
||||
|
||||
|
|
@ -77,7 +77,7 @@ ENABLE_OLLAMA=ollama INFERENCE_MODEL="llama3.2:3b" llama stack build --template
|
|||
You can use a container image to run the Llama Stack server. We provide several container images for the server
|
||||
component that works with different inference providers out of the box. For this guide, we will use
|
||||
`llamastack/distribution-starter` as the container image. If you'd like to build your own image or customize the
|
||||
configurations, please check out [this guide](../references/index.md).
|
||||
configurations, please check out [this guide](../distributions/building_distro.md).
|
||||
First lets setup some environment variables and create a local directory to mount into the container’s file system.
|
||||
```bash
|
||||
export INFERENCE_MODEL="llama3.2:3b"
|
||||
|
|
@ -132,7 +132,7 @@ Now you can use the Llama Stack client to run inference and build agents!
|
|||
You can reuse the server setup or use the [Llama Stack Client](https://github.com/meta-llama/llama-stack-client-python/).
|
||||
Note that the client package is already included in the `llama-stack` package.
|
||||
|
||||
## Step 3: Run Client CLI
|
||||
### Step 3: Run Client CLI
|
||||
|
||||
Open a new terminal and navigate to the same directory you started the server from. Then set up a new or activate your
|
||||
existing server virtual environment.
|
||||
|
|
@ -232,7 +232,7 @@ OpenAIChatCompletion(
|
|||
)
|
||||
```
|
||||
|
||||
## Step 4: Run the Demos
|
||||
### Step 4: Run the Demos
|
||||
|
||||
Note that these demos show the [Python Client SDK](../references/python_sdk_reference/index.md).
|
||||
Other SDKs are also available, please refer to the [Client SDK](../index.md#client-sdks) list for the complete options.
|
||||
|
|
@ -242,7 +242,7 @@ Other SDKs are also available, please refer to the [Client SDK](../index.md#clie
|
|||
:::{tab-item} Basic Inference
|
||||
Now you can run inference using the Llama Stack client SDK.
|
||||
|
||||
### i. Create the Script
|
||||
#### i. Create the Script
|
||||
|
||||
Create a file `inference.py` and add the following code:
|
||||
```python
|
||||
|
|
@ -269,7 +269,7 @@ response = client.chat.completions.create(
|
|||
print(response)
|
||||
```
|
||||
|
||||
### ii. Run the Script
|
||||
#### ii. Run the Script
|
||||
Let's run the script using `uv`
|
||||
```bash
|
||||
uv run python inference.py
|
||||
|
|
@ -283,7 +283,7 @@ OpenAIChatCompletion(id='chatcmpl-30cd0f28-a2ad-4b6d-934b-13707fc60ebf', choices
|
|||
|
||||
:::{tab-item} Build a Simple Agent
|
||||
Next we can move beyond simple inference and build an agent that can perform tasks using the Llama Stack server.
|
||||
### i. Create the Script
|
||||
#### i. Create the Script
|
||||
Create a file `agent.py` and add the following code:
|
||||
|
||||
```python
|
||||
|
|
@ -455,7 +455,7 @@ uv run python agent.py
|
|||
|
||||
For our last demo, we can build a RAG agent that can answer questions about the Torchtune project using the documents
|
||||
in a vector database.
|
||||
### i. Create the Script
|
||||
#### i. Create the Script
|
||||
Create a file `rag_agent.py` and add the following code:
|
||||
|
||||
```python
|
||||
|
|
@ -533,7 +533,7 @@ for t in turns:
|
|||
for event in AgentEventLogger().log(stream):
|
||||
event.print()
|
||||
```
|
||||
### ii. Run the Script
|
||||
#### ii. Run the Script
|
||||
Let's run the script using `uv`
|
||||
```bash
|
||||
uv run python rag_agent.py
|
||||
|
|
|
|||
|
|
@ -1,123 +1,13 @@
|
|||
# Quickstart
|
||||
# Getting Started
|
||||
|
||||
Get started with Llama Stack in minutes!
|
||||
|
||||
Llama Stack is a stateful service with REST APIs to support the seamless transition of AI applications across different
|
||||
environments. You can build and test using a local server first and deploy to a hosted endpoint for production.
|
||||
|
||||
In this guide, we'll walk through how to build a RAG application locally using Llama Stack with [Ollama](https://ollama.com/)
|
||||
as the inference [provider](../providers/inference/index) for a Llama Model.
|
||||
|
||||
**💡 Notebook Version:** You can also follow this quickstart guide in a Jupyter notebook format: [quick_start.ipynb](https://github.com/meta-llama/llama-stack/blob/main/docs/quick_start.ipynb)
|
||||
|
||||
#### Step 1: Install and setup
|
||||
1. Install [uv](https://docs.astral.sh/uv/)
|
||||
2. Run inference on a Llama model with [Ollama](https://ollama.com/download)
|
||||
```bash
|
||||
ollama run llama3.2:3b --keepalive 60m
|
||||
```{include} quickstart.md
|
||||
:start-after: ## Quickstart
|
||||
```
|
||||
#### Step 2: Run the Llama Stack server
|
||||
We will use `uv` to run the Llama Stack server.
|
||||
```bash
|
||||
INFERENCE_MODEL=llama3.2:3b uv run --with llama-stack llama stack build --template starter --image-type venv --run
|
||||
|
||||
```{include} libraries.md
|
||||
:start-after: ## Libraries (SDKs)
|
||||
```
|
||||
#### Step 3: Run the demo
|
||||
Now open up a new terminal and copy the following script into a file named `demo_script.py`.
|
||||
|
||||
```python
|
||||
from llama_stack_client import Agent, AgentEventLogger, RAGDocument, LlamaStackClient
|
||||
|
||||
vector_db_id = "my_demo_vector_db"
|
||||
client = LlamaStackClient(base_url="http://localhost:8321")
|
||||
|
||||
models = client.models.list()
|
||||
|
||||
# Select the first LLM and first embedding models
|
||||
model_id = next(m for m in models if m.model_type == "llm").identifier
|
||||
embedding_model_id = (
|
||||
em := next(m for m in models if m.model_type == "embedding")
|
||||
).identifier
|
||||
embedding_dimension = em.metadata["embedding_dimension"]
|
||||
|
||||
_ = client.vector_dbs.register(
|
||||
vector_db_id=vector_db_id,
|
||||
embedding_model=embedding_model_id,
|
||||
embedding_dimension=embedding_dimension,
|
||||
provider_id="faiss",
|
||||
)
|
||||
source = "https://www.paulgraham.com/greatwork.html"
|
||||
print("rag_tool> Ingesting document:", source)
|
||||
document = RAGDocument(
|
||||
document_id="document_1",
|
||||
content=source,
|
||||
mime_type="text/html",
|
||||
metadata={},
|
||||
)
|
||||
client.tool_runtime.rag_tool.insert(
|
||||
documents=[document],
|
||||
vector_db_id=vector_db_id,
|
||||
chunk_size_in_tokens=50,
|
||||
)
|
||||
agent = Agent(
|
||||
client,
|
||||
model=model_id,
|
||||
instructions="You are a helpful assistant",
|
||||
tools=[
|
||||
{
|
||||
"name": "builtin::rag/knowledge_search",
|
||||
"args": {"vector_db_ids": [vector_db_id]},
|
||||
}
|
||||
],
|
||||
)
|
||||
|
||||
prompt = "How do you do great work?"
|
||||
print("prompt>", prompt)
|
||||
|
||||
response = agent.create_turn(
|
||||
messages=[{"role": "user", "content": prompt}],
|
||||
session_id=agent.create_session("rag_session"),
|
||||
stream=True,
|
||||
)
|
||||
|
||||
for log in AgentEventLogger().log(response):
|
||||
log.print()
|
||||
```{include} detailed_tutorial.md
|
||||
:start-after: ## Detailed Tutorial
|
||||
```
|
||||
We will use `uv` to run the script
|
||||
```
|
||||
uv run --with llama-stack-client,fire,requests demo_script.py
|
||||
```
|
||||
And you should see output like below.
|
||||
```
|
||||
rag_tool> Ingesting document: https://www.paulgraham.com/greatwork.html
|
||||
|
||||
prompt> How do you do great work?
|
||||
|
||||
inference> [knowledge_search(query="What is the key to doing great work")]
|
||||
|
||||
tool_execution> Tool:knowledge_search Args:{'query': 'What is the key to doing great work'}
|
||||
|
||||
tool_execution> Tool:knowledge_search Response:[TextContentItem(text='knowledge_search tool found 5 chunks:\nBEGIN of knowledge_search tool results.\n', type='text'), TextContentItem(text="Result 1:\nDocument_id:docum\nContent: work. Doing great work means doing something important\nso well that you expand people's ideas of what's possible. But\nthere's no threshold for importance. It's a matter of degree, and\noften hard to judge at the time anyway.\n", type='text'), TextContentItem(text="Result 2:\nDocument_id:docum\nContent: work. Doing great work means doing something important\nso well that you expand people's ideas of what's possible. But\nthere's no threshold for importance. It's a matter of degree, and\noften hard to judge at the time anyway.\n", type='text'), TextContentItem(text="Result 3:\nDocument_id:docum\nContent: work. Doing great work means doing something important\nso well that you expand people's ideas of what's possible. But\nthere's no threshold for importance. It's a matter of degree, and\noften hard to judge at the time anyway.\n", type='text'), TextContentItem(text="Result 4:\nDocument_id:docum\nContent: work. Doing great work means doing something important\nso well that you expand people's ideas of what's possible. But\nthere's no threshold for importance. It's a matter of degree, and\noften hard to judge at the time anyway.\n", type='text'), TextContentItem(text="Result 5:\nDocument_id:docum\nContent: work. Doing great work means doing something important\nso well that you expand people's ideas of what's possible. But\nthere's no threshold for importance. It's a matter of degree, and\noften hard to judge at the time anyway.\n", type='text'), TextContentItem(text='END of knowledge_search tool results.\n', type='text')]
|
||||
|
||||
inference> Based on the search results, it seems that doing great work means doing something important so well that you expand people's ideas of what's possible. However, there is no clear threshold for importance, and it can be difficult to judge at the time.
|
||||
|
||||
To further clarify, I would suggest that doing great work involves:
|
||||
|
||||
* Completing tasks with high quality and attention to detail
|
||||
* Expanding on existing knowledge or ideas
|
||||
* Making a positive impact on others through your work
|
||||
* Striving for excellence and continuous improvement
|
||||
|
||||
Ultimately, great work is about making a meaningful contribution and leaving a lasting impression.
|
||||
```
|
||||
Congratulations! You've successfully built your first RAG application using Llama Stack! 🎉🥳
|
||||
|
||||
## Next Steps
|
||||
|
||||
Now you're ready to dive deeper into Llama Stack!
|
||||
- Explore the [Detailed Tutorial](./detailed_tutorial.md).
|
||||
- Try the [Getting Started Notebook](https://github.com/meta-llama/llama-stack/blob/main/docs/getting_started.ipynb).
|
||||
- Browse more [Notebooks on GitHub](https://github.com/meta-llama/llama-stack/tree/main/docs/notebooks).
|
||||
- Learn about Llama Stack [Concepts](../concepts/index.md).
|
||||
- Discover how to [Build Llama Stacks](../distributions/index.md).
|
||||
- Refer to our [References](../references/index.md) for details on the Llama CLI and Python SDK.
|
||||
- Check out the [llama-stack-apps](https://github.com/meta-llama/llama-stack-apps/tree/main/examples) repository for example applications and tutorials.
|
||||
|
|
|
|||
10
docs/source/getting_started/libraries.md
Normal file
10
docs/source/getting_started/libraries.md
Normal file
|
|
@ -0,0 +1,10 @@
|
|||
## Libraries (SDKs)
|
||||
|
||||
We have a number of client-side SDKs available for different languages.
|
||||
|
||||
| **Language** | **Client SDK** | **Package** |
|
||||
| :----: | :----: | :----: |
|
||||
| Python | [llama-stack-client-python](https://github.com/meta-llama/llama-stack-client-python) | [](https://pypi.org/project/llama_stack_client/)
|
||||
| Swift | [llama-stack-client-swift](https://github.com/meta-llama/llama-stack-client-swift/tree/latest-release) | [](https://swiftpackageindex.com/meta-llama/llama-stack-client-swift)
|
||||
| Node | [llama-stack-client-node](https://github.com/meta-llama/llama-stack-client-node) | [](https://npmjs.org/package/llama-stack-client)
|
||||
| Kotlin | [llama-stack-client-kotlin](https://github.com/meta-llama/llama-stack-client-kotlin/tree/latest-release) | [](https://central.sonatype.com/artifact/com.llama.llamastack/llama-stack-client-kotlin)
|
||||
123
docs/source/getting_started/quickstart.md
Normal file
123
docs/source/getting_started/quickstart.md
Normal file
|
|
@ -0,0 +1,123 @@
|
|||
## Quickstart
|
||||
|
||||
Get started with Llama Stack in minutes!
|
||||
|
||||
Llama Stack is a stateful service with REST APIs to support the seamless transition of AI applications across different
|
||||
environments. You can build and test using a local server first and deploy to a hosted endpoint for production.
|
||||
|
||||
In this guide, we'll walk through how to build a RAG application locally using Llama Stack with [Ollama](https://ollama.com/)
|
||||
as the inference [provider](../providers/inference/index) for a Llama Model.
|
||||
|
||||
**💡 Notebook Version:** You can also follow this quickstart guide in a Jupyter notebook format: [quick_start.ipynb](https://github.com/meta-llama/llama-stack/blob/main/docs/quick_start.ipynb)
|
||||
|
||||
#### Step 1: Install and setup
|
||||
1. Install [uv](https://docs.astral.sh/uv/)
|
||||
2. Run inference on a Llama model with [Ollama](https://ollama.com/download)
|
||||
```bash
|
||||
ollama run llama3.2:3b --keepalive 60m
|
||||
```
|
||||
#### Step 2: Run the Llama Stack server
|
||||
We will use `uv` to run the Llama Stack server.
|
||||
```bash
|
||||
INFERENCE_MODEL=llama3.2:3b uv run --with llama-stack llama stack build --template starter --image-type venv --run
|
||||
```
|
||||
#### Step 3: Run the demo
|
||||
Now open up a new terminal and copy the following script into a file named `demo_script.py`.
|
||||
|
||||
```python
|
||||
from llama_stack_client import Agent, AgentEventLogger, RAGDocument, LlamaStackClient
|
||||
|
||||
vector_db_id = "my_demo_vector_db"
|
||||
client = LlamaStackClient(base_url="http://localhost:8321")
|
||||
|
||||
models = client.models.list()
|
||||
|
||||
# Select the first LLM and first embedding models
|
||||
model_id = next(m for m in models if m.model_type == "llm").identifier
|
||||
embedding_model_id = (
|
||||
em := next(m for m in models if m.model_type == "embedding")
|
||||
).identifier
|
||||
embedding_dimension = em.metadata["embedding_dimension"]
|
||||
|
||||
_ = client.vector_dbs.register(
|
||||
vector_db_id=vector_db_id,
|
||||
embedding_model=embedding_model_id,
|
||||
embedding_dimension=embedding_dimension,
|
||||
provider_id="faiss",
|
||||
)
|
||||
source = "https://www.paulgraham.com/greatwork.html"
|
||||
print("rag_tool> Ingesting document:", source)
|
||||
document = RAGDocument(
|
||||
document_id="document_1",
|
||||
content=source,
|
||||
mime_type="text/html",
|
||||
metadata={},
|
||||
)
|
||||
client.tool_runtime.rag_tool.insert(
|
||||
documents=[document],
|
||||
vector_db_id=vector_db_id,
|
||||
chunk_size_in_tokens=50,
|
||||
)
|
||||
agent = Agent(
|
||||
client,
|
||||
model=model_id,
|
||||
instructions="You are a helpful assistant",
|
||||
tools=[
|
||||
{
|
||||
"name": "builtin::rag/knowledge_search",
|
||||
"args": {"vector_db_ids": [vector_db_id]},
|
||||
}
|
||||
],
|
||||
)
|
||||
|
||||
prompt = "How do you do great work?"
|
||||
print("prompt>", prompt)
|
||||
|
||||
response = agent.create_turn(
|
||||
messages=[{"role": "user", "content": prompt}],
|
||||
session_id=agent.create_session("rag_session"),
|
||||
stream=True,
|
||||
)
|
||||
|
||||
for log in AgentEventLogger().log(response):
|
||||
log.print()
|
||||
```
|
||||
We will use `uv` to run the script
|
||||
```
|
||||
uv run --with llama-stack-client,fire,requests demo_script.py
|
||||
```
|
||||
And you should see output like below.
|
||||
```
|
||||
rag_tool> Ingesting document: https://www.paulgraham.com/greatwork.html
|
||||
|
||||
prompt> How do you do great work?
|
||||
|
||||
inference> [knowledge_search(query="What is the key to doing great work")]
|
||||
|
||||
tool_execution> Tool:knowledge_search Args:{'query': 'What is the key to doing great work'}
|
||||
|
||||
tool_execution> Tool:knowledge_search Response:[TextContentItem(text='knowledge_search tool found 5 chunks:\nBEGIN of knowledge_search tool results.\n', type='text'), TextContentItem(text="Result 1:\nDocument_id:docum\nContent: work. Doing great work means doing something important\nso well that you expand people's ideas of what's possible. But\nthere's no threshold for importance. It's a matter of degree, and\noften hard to judge at the time anyway.\n", type='text'), TextContentItem(text="Result 2:\nDocument_id:docum\nContent: work. Doing great work means doing something important\nso well that you expand people's ideas of what's possible. But\nthere's no threshold for importance. It's a matter of degree, and\noften hard to judge at the time anyway.\n", type='text'), TextContentItem(text="Result 3:\nDocument_id:docum\nContent: work. Doing great work means doing something important\nso well that you expand people's ideas of what's possible. But\nthere's no threshold for importance. It's a matter of degree, and\noften hard to judge at the time anyway.\n", type='text'), TextContentItem(text="Result 4:\nDocument_id:docum\nContent: work. Doing great work means doing something important\nso well that you expand people's ideas of what's possible. But\nthere's no threshold for importance. It's a matter of degree, and\noften hard to judge at the time anyway.\n", type='text'), TextContentItem(text="Result 5:\nDocument_id:docum\nContent: work. Doing great work means doing something important\nso well that you expand people's ideas of what's possible. But\nthere's no threshold for importance. It's a matter of degree, and\noften hard to judge at the time anyway.\n", type='text'), TextContentItem(text='END of knowledge_search tool results.\n', type='text')]
|
||||
|
||||
inference> Based on the search results, it seems that doing great work means doing something important so well that you expand people's ideas of what's possible. However, there is no clear threshold for importance, and it can be difficult to judge at the time.
|
||||
|
||||
To further clarify, I would suggest that doing great work involves:
|
||||
|
||||
* Completing tasks with high quality and attention to detail
|
||||
* Expanding on existing knowledge or ideas
|
||||
* Making a positive impact on others through your work
|
||||
* Striving for excellence and continuous improvement
|
||||
|
||||
Ultimately, great work is about making a meaningful contribution and leaving a lasting impression.
|
||||
```
|
||||
Congratulations! You've successfully built your first RAG application using Llama Stack! 🎉🥳
|
||||
|
||||
### Next Steps
|
||||
|
||||
Now you're ready to dive deeper into Llama Stack!
|
||||
- Explore the [Detailed Tutorial](./detailed_tutorial.md).
|
||||
- Try the [Getting Started Notebook](https://github.com/meta-llama/llama-stack/blob/main/docs/getting_started.ipynb).
|
||||
- Browse more [Notebooks on GitHub](https://github.com/meta-llama/llama-stack/tree/main/docs/notebooks).
|
||||
- Learn about Llama Stack [Concepts](../concepts/index.md).
|
||||
- Discover how to [Build Llama Stacks](../distributions/index.md).
|
||||
- Refer to our [References](../references/index.md) for details on the Llama CLI and Python SDK.
|
||||
- Check out the [llama-stack-apps](https://github.com/meta-llama/llama-stack-apps/tree/main/examples) repository for example applications and tutorials.
|
||||
|
|
@ -40,17 +40,6 @@ Kotlin.
|
|||
- Ready to build? Check out the [Quick Start](getting_started/index) to get started.
|
||||
- Want to contribute? See the [Contributing](contributing/index) guide.
|
||||
|
||||
## Client SDKs
|
||||
|
||||
We have a number of client-side SDKs available for different languages.
|
||||
|
||||
| **Language** | **Client SDK** | **Package** |
|
||||
| :----: | :----: | :----: |
|
||||
| Python | [llama-stack-client-python](https://github.com/meta-llama/llama-stack-client-python) | [](https://pypi.org/project/llama_stack_client/)
|
||||
| Swift | [llama-stack-client-swift](https://github.com/meta-llama/llama-stack-client-swift/tree/latest-release) | [](https://swiftpackageindex.com/meta-llama/llama-stack-client-swift)
|
||||
| Node | [llama-stack-client-node](https://github.com/meta-llama/llama-stack-client-node) | [](https://npmjs.org/package/llama-stack-client)
|
||||
| Kotlin | [llama-stack-client-kotlin](https://github.com/meta-llama/llama-stack-client-kotlin/tree/latest-release) | [](https://central.sonatype.com/artifact/com.llama.llamastack/llama-stack-client-kotlin)
|
||||
|
||||
## Supported Llama Stack Implementations
|
||||
|
||||
A number of "adapters" are available for some popular Inference and Vector Store providers. For other APIs (particularly Safety and Agents), we provide *reference implementations* you can use to get started. We expect this list to grow over time. We are slowly onboarding more providers to the ecosystem as we get more confidence in the APIs.
|
||||
|
|
@ -133,14 +122,12 @@ A number of "adapters" are available for some popular Inference and Vector Store
|
|||
|
||||
self
|
||||
getting_started/index
|
||||
getting_started/detailed_tutorial
|
||||
introduction/index
|
||||
concepts/index
|
||||
openai/index
|
||||
providers/index
|
||||
distributions/index
|
||||
advanced_apis/index
|
||||
building_applications/index
|
||||
playground/index
|
||||
deploying/index
|
||||
contributing/index
|
||||
references/index
|
||||
```
|
||||
|
|
|
|||
|
|
@ -1,4 +1,4 @@
|
|||
# Providers Overview
|
||||
# API Providers Overview
|
||||
|
||||
The goal of Llama Stack is to build an ecosystem where users can easily swap out different implementations for the same API. Examples for these include:
|
||||
- LLM inference providers (e.g., Meta Reference, Ollama, Fireworks, Together, AWS Bedrock, Groq, Cerebras, SambaNova, vLLM, OpenAI, Anthropic, Gemini, WatsonX, etc.),
|
||||
|
|
@ -13,13 +13,25 @@ Providers come in two flavors:
|
|||
Importantly, Llama Stack always strives to provide at least one fully inline provider for each API so you can iterate on a fully featured environment locally.
|
||||
|
||||
## External Providers
|
||||
|
||||
Llama Stack supports external providers that live outside of the main codebase. This allows you to create and maintain your own providers independently.
|
||||
|
||||
```{toctree}
|
||||
:maxdepth: 1
|
||||
|
||||
external
|
||||
external.md
|
||||
```
|
||||
|
||||
```{include} openai.md
|
||||
:start-after: ## OpenAI API Compatibility
|
||||
```
|
||||
|
||||
## Inference
|
||||
Runs inference with an LLM.
|
||||
|
||||
```{toctree}
|
||||
:maxdepth: 1
|
||||
|
||||
inference/index
|
||||
```
|
||||
|
||||
## Agents
|
||||
|
|
@ -40,33 +52,6 @@ Interfaces with datasets and data loaders.
|
|||
datasetio/index
|
||||
```
|
||||
|
||||
## Eval
|
||||
Generates outputs (via Inference or Agents) and perform scoring.
|
||||
|
||||
```{toctree}
|
||||
:maxdepth: 1
|
||||
|
||||
eval/index
|
||||
```
|
||||
|
||||
## Inference
|
||||
Runs inference with an LLM.
|
||||
|
||||
```{toctree}
|
||||
:maxdepth: 1
|
||||
|
||||
inference/index
|
||||
```
|
||||
|
||||
## Post Training
|
||||
Fine-tunes a model.
|
||||
|
||||
```{toctree}
|
||||
:maxdepth: 1
|
||||
|
||||
post_training/index
|
||||
```
|
||||
|
||||
## Safety
|
||||
Applies safety policies to the output at a Systems (not only model) level.
|
||||
|
||||
|
|
@ -76,15 +61,6 @@ Applies safety policies to the output at a Systems (not only model) level.
|
|||
safety/index
|
||||
```
|
||||
|
||||
## Scoring
|
||||
Evaluates the outputs of the system.
|
||||
|
||||
```{toctree}
|
||||
:maxdepth: 1
|
||||
|
||||
scoring/index
|
||||
```
|
||||
|
||||
## Telemetry
|
||||
Collects telemetry data from the system.
|
||||
|
||||
|
|
@ -94,15 +70,6 @@ Collects telemetry data from the system.
|
|||
telemetry/index
|
||||
```
|
||||
|
||||
## Tool Runtime
|
||||
Is associated with the ToolGroup resouces.
|
||||
|
||||
```{toctree}
|
||||
:maxdepth: 1
|
||||
|
||||
tool_runtime/index
|
||||
```
|
||||
|
||||
## Vector IO
|
||||
|
||||
Vector IO refers to operations on vector databases, such as adding documents, searching, and deleting documents.
|
||||
|
|
@ -114,3 +81,12 @@ io and database are used to store and retrieve documents for retrieval.
|
|||
|
||||
vector_io/index
|
||||
```
|
||||
|
||||
## Tool Runtime
|
||||
Is associated with the ToolGroup resources.
|
||||
|
||||
```{toctree}
|
||||
:maxdepth: 1
|
||||
|
||||
tool_runtime/index
|
||||
```
|
||||
|
|
@ -1,14 +1,14 @@
|
|||
# OpenAI API Compatibility
|
||||
## OpenAI API Compatibility
|
||||
|
||||
## Server path
|
||||
### Server path
|
||||
|
||||
Llama Stack exposes an OpenAI-compatible API endpoint at `/v1/openai/v1`. So, for a Llama Stack server running locally on port `8321`, the full url to the OpenAI-compatible API endpoint is `http://localhost:8321/v1/openai/v1`.
|
||||
|
||||
## Clients
|
||||
### Clients
|
||||
|
||||
You should be able to use any client that speaks OpenAI APIs with Llama Stack. We regularly test with the official Llama Stack clients as well as OpenAI's official Python client.
|
||||
|
||||
### Llama Stack Client
|
||||
#### Llama Stack Client
|
||||
|
||||
When using the Llama Stack client, set the `base_url` to the root of your Llama Stack server. It will automatically route OpenAI-compatible requests to the right server endpoint for you.
|
||||
|
||||
|
|
@ -18,7 +18,7 @@ from llama_stack_client import LlamaStackClient
|
|||
client = LlamaStackClient(base_url="http://localhost:8321")
|
||||
```
|
||||
|
||||
### OpenAI Client
|
||||
#### OpenAI Client
|
||||
|
||||
When using an OpenAI client, set the `base_url` to the `/v1/openai/v1` path on your Llama Stack server.
|
||||
|
||||
|
|
@ -30,9 +30,9 @@ client = OpenAI(base_url="http://localhost:8321/v1/openai/v1", api_key="none")
|
|||
|
||||
Regardless of the client you choose, the following code examples should all work the same.
|
||||
|
||||
## APIs implemented
|
||||
### APIs implemented
|
||||
|
||||
### Models
|
||||
#### Models
|
||||
|
||||
Many of the APIs require you to pass in a model parameter. To see the list of models available in your Llama Stack server:
|
||||
|
||||
|
|
@ -40,13 +40,13 @@ Many of the APIs require you to pass in a model parameter. To see the list of mo
|
|||
models = client.models.list()
|
||||
```
|
||||
|
||||
### Responses
|
||||
#### Responses
|
||||
|
||||
:::{note}
|
||||
The Responses API implementation is still in active development. While it is quite usable, there are still unimplemented parts of the API. We'd love feedback on any use-cases you try that do not work to help prioritize the pieces left to implement. Please open issues in the [meta-llama/llama-stack](https://github.com/meta-llama/llama-stack) GitHub repository with details of anything that does not work.
|
||||
:::
|
||||
|
||||
#### Simple inference
|
||||
##### Simple inference
|
||||
|
||||
Request:
|
||||
|
||||
|
|
@ -66,7 +66,7 @@ Syntax whispers secrets sweet
|
|||
Code's gentle silence
|
||||
```
|
||||
|
||||
#### Structured Output
|
||||
##### Structured Output
|
||||
|
||||
Request:
|
||||
|
||||
|
|
@ -106,9 +106,9 @@ Example output:
|
|||
{ "participants": ["Alice", "Bob"] }
|
||||
```
|
||||
|
||||
### Chat Completions
|
||||
#### Chat Completions
|
||||
|
||||
#### Simple inference
|
||||
##### Simple inference
|
||||
|
||||
Request:
|
||||
|
||||
|
|
@ -129,7 +129,7 @@ Logic flows like a river
|
|||
Code's gentle beauty
|
||||
```
|
||||
|
||||
#### Structured Output
|
||||
##### Structured Output
|
||||
|
||||
Request:
|
||||
|
||||
|
|
@ -170,9 +170,9 @@ Example output:
|
|||
{ "participants": ["Alice", "Bob"] }
|
||||
```
|
||||
|
||||
### Completions
|
||||
#### Completions
|
||||
|
||||
#### Simple inference
|
||||
##### Simple inference
|
||||
|
||||
Request:
|
||||
|
||||
|
|
@ -114,7 +114,7 @@ For more details on TLS configuration, refer to the [TLS setup guide](https://mi
|
|||
| `uri` | `<class 'str'>` | No | PydanticUndefined | The URI of the Milvus server |
|
||||
| `token` | `str \| None` | No | PydanticUndefined | The token of the Milvus server |
|
||||
| `consistency_level` | `<class 'str'>` | No | Strong | The consistency level of the Milvus server |
|
||||
| `kvstore` | `utils.kvstore.config.RedisKVStoreConfig \| utils.kvstore.config.SqliteKVStoreConfig \| utils.kvstore.config.PostgresKVStoreConfig \| utils.kvstore.config.MongoDBKVStoreConfig, annotation=NoneType, required=False, default='sqlite', discriminator='type'` | No | | Config for KV store backend (SQLite only for now) |
|
||||
| `kvstore` | `utils.kvstore.config.RedisKVStoreConfig \| utils.kvstore.config.SqliteKVStoreConfig \| utils.kvstore.config.PostgresKVStoreConfig \| utils.kvstore.config.MongoDBKVStoreConfig` | No | sqlite | Config for KV store backend |
|
||||
| `config` | `dict` | No | {} | This configuration allows additional fields to be passed through to the underlying Milvus client. See the [Milvus](https://milvus.io/docs/install-overview.md) documentation for more details about Milvus in general. |
|
||||
|
||||
> **Note**: This configuration class accepts additional fields beyond those listed above. You can pass any additional configuration options that will be forwarded to the underlying provider.
|
||||
|
|
@ -124,6 +124,9 @@ For more details on TLS configuration, refer to the [TLS setup guide](https://mi
|
|||
```yaml
|
||||
uri: ${env.MILVUS_ENDPOINT}
|
||||
token: ${env.MILVUS_TOKEN}
|
||||
kvstore:
|
||||
type: sqlite
|
||||
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/dummy}/milvus_remote_registry.db
|
||||
|
||||
```
|
||||
|
||||
|
|
|
|||
|
|
@ -40,6 +40,7 @@ See [PGVector's documentation](https://github.com/pgvector/pgvector) for more de
| `db` | `str \| None` | No | postgres | |
| `user` | `str \| None` | No | postgres | |
| `password` | `str \| None` | No | mysecretpassword | |
| `kvstore` | `utils.kvstore.config.RedisKVStoreConfig \| utils.kvstore.config.SqliteKVStoreConfig \| utils.kvstore.config.PostgresKVStoreConfig \| utils.kvstore.config.MongoDBKVStoreConfig, annotation=NoneType, required=False, default='sqlite', discriminator='type'` | No | | Config for KV store backend (SQLite only for now) |

## Sample Configuration

@ -49,6 +50,9 @@ port: ${env.PGVECTOR_PORT:=5432}
db: ${env.PGVECTOR_DB}
user: ${env.PGVECTOR_USER}
password: ${env.PGVECTOR_PASSWORD}
kvstore:
  type: sqlite
  db_path: ${env.SQLITE_STORE_DIR:=~/.llama/dummy}/pgvector_registry.db

```
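As with the Milvus sample, the `${env.*}` placeholders are read from the environment; a minimal sketch using the defaults shown in the field table above (`PGVECTOR_PORT` falls back to 5432 if unset):

```bash
export PGVECTOR_PORT=5432
export PGVECTOR_DB="postgres"
export PGVECTOR_USER="postgres"
export PGVECTOR_PASSWORD="mysecretpassword"
```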
@ -36,7 +36,9 @@ See [Weaviate's documentation](https://weaviate.io/developers/weaviate) for more
## Sample Configuration

```yaml
{}
kvstore:
  type: sqlite
  db_path: ${env.SQLITE_STORE_DIR:=~/.llama/dummy}/weaviate_registry.db

```
@ -9,7 +9,8 @@ The `llama-stack-client` CLI allows you to query information about the distribut
llama-stack-client
Usage: llama-stack-client [OPTIONS] COMMAND [ARGS]...

Welcome to the LlamaStackClient CLI
Welcome to the llama-stack-client CLI - a command-line interface for
interacting with Llama Stack

Options:
--version  Show the version and exit.

@ -35,6 +36,7 @@ Commands:
```

### `llama-stack-client configure`
Configure Llama Stack Client CLI.
```bash
llama-stack-client configure
> Enter the host name of the Llama Stack distribution server: localhost

@ -42,7 +44,24 @@ llama-stack-client configure
Done! You can now use the Llama Stack Client CLI with endpoint http://localhost:8321
```

Optional arguments:
- `--endpoint`: Llama Stack distribution endpoint
- `--api-key`: Llama Stack distribution API key
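The same settings can also be supplied non-interactively with these flags; the endpoint below matches the example above, and the API key is a placeholder:

```bash
llama-stack-client configure --endpoint http://localhost:8321 --api-key my-api-key
```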
## `llama-stack-client inspect version`
Inspect server configuration.
```bash
llama-stack-client inspect version
```
```bash
VersionInfo(version='0.2.14')
```

### `llama-stack-client providers list`
Show available providers on distribution endpoint
```bash
llama-stack-client providers list
```

@ -66,9 +85,74 @@ llama-stack-client providers list
+-----------+----------------+-----------------+
```

### `llama-stack-client providers inspect`
Show specific provider configuration on distribution endpoint
```bash
llama-stack-client providers inspect <provider_id>
```
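For example, to inspect a single provider, pass one of the ids returned by `providers list` (the id below is illustrative):

```bash
llama-stack-client providers inspect ollama
```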
## Inference
Inference (chat).

### `llama-stack-client inference chat-completion`
Run a chat completion against the distribution endpoint
```bash
llama-stack-client inference chat-completion --message <message> [--stream] [--session] [--model-id]
```
```bash
OpenAIChatCompletion(
    id='chatcmpl-aacd11f3-8899-4ec5-ac5b-e655132f6891',
    choices=[
        OpenAIChatCompletionChoice(
            finish_reason='stop',
            index=0,
            message=OpenAIChatCompletionChoiceMessageOpenAIAssistantMessageParam(
                role='assistant',
                content='The captain of the whaleship Pequod in Nathaniel Hawthorne\'s novel "Moby-Dick" is Captain
Ahab. He\'s a vengeful and obsessive old sailor who\'s determined to hunt down and kill the white sperm whale
Moby-Dick, whom he\'s lost his leg to in a previous encounter.',
                name=None,
                tool_calls=None,
                refusal=None,
                annotations=None,
                audio=None,
                function_call=None
            ),
            logprobs=None
        )
    ],
    created=1752578797,
    model='llama3.2:3b-instruct-fp16',
    object='chat.completion',
    service_tier=None,
    system_fingerprint='fp_ollama',
    usage={
        'completion_tokens': 67,
        'prompt_tokens': 33,
        'total_tokens': 100,
        'completion_tokens_details': None,
        'prompt_tokens_details': None
    }
)
```

Required arguments:
**Note:** At least one of these parameters is required for chat completion
- `--message`: Message
- `--session`: Start a Chat Session

Optional arguments:
- `--stream`: Stream
- `--model-id`: Model ID
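As a concrete sketch, the flags can be combined like this; the model id is taken from the sample output above and will differ per distribution:

```bash
llama-stack-client inference chat-completion \
  --message "who is the captain of the Pequod?" \
  --model-id llama3.2:3b-instruct-fp16 \
  --stream
```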
## Model Management
Manage GenAI models.

### `llama-stack-client models list`
Show available llama models at distribution endpoint
```bash
llama-stack-client models list
```

@ -85,6 +169,7 @@ Total models: 1
```

### `llama-stack-client models get`
Show details of a specific model at the distribution endpoint
```bash
llama-stack-client models get Llama3.1-8B-Instruct
```

@ -105,69 +190,92 @@ Model RandomModel is not found at distribution endpoint host:port. Please ensure
```

### `llama-stack-client models register`
Register a new model at distribution endpoint
```bash
llama-stack-client models register <model_id> [--provider-id <provider_id>] [--provider-model-id <provider_model_id>] [--metadata <metadata>]
llama-stack-client models register <model_id> [--provider-id <provider_id>] [--provider-model-id <provider_model_id>] [--metadata <metadata>] [--model-type <model_type>]
```

### `llama-stack-client models update`
Required arguments:
- `MODEL_ID`: Model ID
- `--provider-id`: Provider ID for the model

Optional arguments:
- `--provider-model-id`: Provider's model ID
- `--metadata`: JSON metadata for the model
- `--model-type`: Model type: `llm`, `embedding`
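A hypothetical registration of an embedding model (the ids and metadata below are placeholders, not values guaranteed to exist on your distribution):

```bash
llama-stack-client models register all-MiniLM-L6-v2 \
  --provider-id ollama \
  --provider-model-id all-minilm:latest \
  --model-type embedding \
  --metadata '{"embedding_dimension": 384}'
```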
### `llama-stack-client models unregister`
Unregister a model from distribution endpoint
```bash
llama-stack-client models update <model_id> [--provider-id <provider_id>] [--provider-model-id <provider_model_id>] [--metadata <metadata>]
```

### `llama-stack-client models delete`
```bash
llama-stack-client models delete <model_id>
llama-stack-client models unregister <model_id>
```

## Vector DB Management
Manage vector databases.

### `llama-stack-client vector_dbs list`
Show available vector dbs on distribution endpoint
```bash
llama-stack-client vector_dbs list
```
```
+--------------+----------------+---------------------+---------------+------------------------+
| identifier | provider_id | provider_resource_id| vector_db_type| params |
+==============+================+=====================+===============+========================+
| test_bank | meta-reference | test_bank | vector | embedding_model: all-MiniLM-L6-v2
embedding_dimension: 384|
+--------------+----------------+---------------------+---------------+------------------------+
┏━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
┃ identifier ┃ provider_id ┃ provider_resource_id ┃ vector_db_type ┃ params ┃
┡━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
│ my_demo_vector_db │ faiss │ my_demo_vector_db │ │ embedding_dimension: 384 │
│ │ │ │ │ embedding_model: all-MiniLM-L6-v2 │
│ │ │ │ │ type: vector_db │
│ │ │ │ │ │
└──────────────────────────┴─────────────┴──────────────────────────┴────────────────┴───────────────────────────────────┘
```

### `llama-stack-client vector_dbs register`
Create a new vector db
```bash
llama-stack-client vector_dbs register <vector-db-id> [--provider-id <provider-id>] [--provider-vector-db-id <provider-vector-db-id>] [--embedding-model <embedding-model>] [--embedding-dimension <embedding-dimension>]
```

Required arguments:
- `VECTOR_DB_ID`: Vector DB ID

Optional arguments:
- `--provider-id`: Provider ID for the vector db
- `--provider-vector-db-id`: Provider's vector db ID
- `--embedding-model`: Embedding model to use. Default: "all-MiniLM-L6-v2"
- `--embedding-model`: Embedding model to use. Default: `all-MiniLM-L6-v2`
- `--embedding-dimension`: Dimension of embeddings. Default: 384
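For example, the vector db shown in the listing above could have been created like this (the id and provider are illustrative):

```bash
llama-stack-client vector_dbs register my_demo_vector_db \
  --provider-id faiss \
  --embedding-model all-MiniLM-L6-v2 \
  --embedding-dimension 384
```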
### `llama-stack-client vector_dbs unregister`
Delete a vector db
```bash
llama-stack-client vector_dbs unregister <vector-db-id>
```

Required arguments:
- `VECTOR_DB_ID`: Vector DB ID

## Shield Management
Manage safety shield services.
### `llama-stack-client shields list`
Show available safety shields on distribution endpoint
```bash
llama-stack-client shields list
```

```
+--------------+----------+----------------+-------------+
| identifier | params | provider_id | type |
+==============+==========+================+=============+
| llama_guard | {} | meta-reference | llama_guard |
+--------------+----------+----------------+-------------+
┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
┃ identifier ┃ provider_alias ┃ params ┃ provider_id ┃
┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
│ ollama │ ollama/llama-guard3:1b │ │ llama-guard │
└──────────────────────────────────┴───────────────────────────────────────────────────────────────────────┴───────────────────────┴────────────────────────────────────┘
```

### `llama-stack-client shields register`
Register a new safety shield
```bash
llama-stack-client shields register --shield-id <shield-id> [--provider-id <provider-id>] [--provider-shield-id <provider-shield-id>] [--params <params>]
```
@ -180,41 +288,29 @@ Optional arguments:
- `--provider-shield-id`: Provider's shield ID
- `--params`: JSON configuration parameters for the shield
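For instance, a Llama Guard shield could be registered like this; the shield and provider ids mirror the listing above and are illustrative:

```bash
llama-stack-client shields register \
  --shield-id llama_guard \
  --provider-id llama-guard \
  --params '{}'
```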
## Eval Task Management

### `llama-stack-client benchmarks list`
```bash
llama-stack-client benchmarks list
```

### `llama-stack-client benchmarks register`
```bash
llama-stack-client benchmarks register --eval-task-id <eval-task-id> --dataset-id <dataset-id> --scoring-functions <function1> [<function2> ...] [--provider-id <provider-id>] [--provider-eval-task-id <provider-eval-task-id>] [--metadata <metadata>]
```

Required arguments:
- `--eval-task-id`: ID of the eval task
- `--dataset-id`: ID of the dataset to evaluate
- `--scoring-functions`: One or more scoring functions to use for evaluation

Optional arguments:
- `--provider-id`: Provider ID for the eval task
- `--provider-eval-task-id`: Provider's eval task ID
- `--metadata`: Metadata for the eval task in JSON format
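A hypothetical registration (the task, dataset, and scoring-function ids are placeholders; use ids that exist on your distribution):

```bash
llama-stack-client benchmarks register \
  --eval-task-id meta-reference-mmlu \
  --dataset-id mmlu \
  --scoring-functions basic::equality
```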
## Eval execution
Run evaluation tasks.

### `llama-stack-client eval run-benchmark`
Run an evaluation benchmark task
```bash
llama-stack-client eval run-benchmark <eval-task-id1> [<eval-task-id2> ...] --eval-task-config <config-file> --output-dir <output-dir> [--num-examples <num>] [--visualize]
llama-stack-client eval run-benchmark <eval-task-id1> [<eval-task-id2> ...] --eval-task-config <config-file> --output-dir <output-dir> --model-id <model-id> [--num-examples <num>] [--visualize] [--repeat-penalty <repeat-penalty>] [--top-p <top-p>] [--max-tokens <max-tokens>]
```

Required arguments:
- `--eval-task-config`: Path to the eval task config file in JSON format
- `--output-dir`: Path to the directory where evaluation results will be saved
- `--model-id`: model id to run the benchmark eval on

Optional arguments:
- `--num-examples`: Number of examples to evaluate (useful for debugging)
- `--visualize`: If set, visualizes evaluation results after completion
- `--repeat-penalty`: repeat-penalty in the sampling params to run generation
- `--top-p`: top-p in the sampling params to run generation
- `--max-tokens`: max-tokens in the sampling params to run generation
- `--temperature`: temperature in the sampling params to run generation

Example benchmark_config.json:
```json
@ -231,21 +327,55 @@ Example benchmark_config.json:
```
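With a config file like the one referenced above, a typical invocation might look like this (the task id, paths, and model id are illustrative):

```bash
llama-stack-client eval run-benchmark meta-reference-mmlu \
  --eval-task-config ./benchmark_config.json \
  --output-dir ./eval_results \
  --model-id meta-llama/Llama-3.1-8B-Instruct \
  --num-examples 10 \
  --visualize
```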
### `llama-stack-client eval run-scoring`
Run scoring from application datasets
```bash
llama-stack-client eval run-scoring <eval-task-id> --eval-task-config <config-file> --output-dir <output-dir> [--num-examples <num>] [--visualize]
llama-stack-client eval run-scoring <eval-task-id> --output-dir <output-dir> [--num-examples <num>] [--visualize]
```

Required arguments:
- `--eval-task-config`: Path to the eval task config file in JSON format
- `--output-dir`: Path to the directory where scoring results will be saved

Optional arguments:
- `--num-examples`: Number of examples to evaluate (useful for debugging)
- `--visualize`: If set, visualizes scoring results after completion
- `--scoring-params-config`: Path to the scoring params config file in JSON format
- `--dataset-id`: Pre-registered dataset_id to score (from llama-stack-client datasets list)
- `--dataset-path`: Path to the dataset file to score
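A sketch of scoring a local dataset file (the task id and paths are placeholders):

```bash
llama-stack-client eval run-scoring my-scoring-task \
  --dataset-path ./my_dataset.csv \
  --output-dir ./scoring_results \
  --num-examples 10
```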
## Eval Tasks
Manage evaluation tasks.

### `llama-stack-client eval_tasks list`
Show available eval tasks on distribution endpoint
```bash
llama-stack-client eval_tasks list
```

### `llama-stack-client eval_tasks register`
Register a new eval task
```bash
llama-stack-client eval_tasks register --eval-task-id <eval-task-id> --dataset-id <dataset-id> --scoring-functions <scoring-functions> [--provider-id <provider-id>] [--provider-eval-task-id <provider-eval-task-id>] [--metadata <metadata>]
```

Required arguments:
- `--eval-task-id`: ID of the eval task
- `--dataset-id`: ID of the dataset to evaluate
- `--scoring-functions`: Scoring functions to use for evaluation

Optional arguments:
- `--provider-id`: Provider ID for the eval task
- `--provider-eval-task-id`: Provider's eval task ID
## Tool Group Management
Manage available tool groups.

### `llama-stack-client toolgroups list`
Show available llama toolgroups at distribution endpoint
```bash
llama-stack-client toolgroups list
```

@ -260,17 +390,28 @@ llama-stack-client toolgroups list
```

### `llama-stack-client toolgroups get`
Get available llama toolgroups by id
```bash
llama-stack-client toolgroups get <toolgroup_id>
```

Shows detailed information about a specific toolgroup. If the toolgroup is not found, displays an error message.

Required arguments:
- `TOOLGROUP_ID`: ID of the tool group

### `llama-stack-client toolgroups register`
Register a new toolgroup at distribution endpoint
```bash
llama-stack-client toolgroups register <toolgroup_id> [--provider-id <provider-id>] [--provider-toolgroup-id <provider-toolgroup-id>] [--mcp-config <mcp-config>] [--args <args>]
```

Required arguments:
- `TOOLGROUP_ID`: ID of the tool group

Optional arguments:
- `--provider-id`: Provider ID for the toolgroup
- `--provider-toolgroup-id`: Provider's toolgroup ID

@ -278,6 +419,172 @@ Optional arguments:
- `--args`: JSON arguments for the toolgroup
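For example, a web-search tool group could be registered against a search provider; both ids below are illustrative, so substitute a toolgroup and provider that exist on your distribution:

```bash
llama-stack-client toolgroups register builtin::websearch \
  --provider-id tavily-search
```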
### `llama-stack-client toolgroups unregister`
Unregister a toolgroup from distribution endpoint
```bash
llama-stack-client toolgroups unregister <toolgroup_id>
```

Required arguments:
- `TOOLGROUP_ID`: ID of the tool group

## Datasets Management
Manage datasets.

### `llama-stack-client datasets list`
Show available datasets on distribution endpoint
```bash
llama-stack-client datasets list
```

### `llama-stack-client datasets register`
```bash
llama-stack-client datasets register --dataset_id <dataset_id> --purpose <purpose> [--url <url>] [--dataset-path <dataset-path>] [--dataset-id <dataset-id>] [--metadata <metadata>]
```

Required arguments:
- `--dataset_id`: Id of the dataset
- `--purpose`: Purpose of the dataset

Optional arguments:
- `--metadata`: Metadata of the dataset
- `--url`: URL of the dataset
- `--dataset-path`: Local file path to the dataset. If specified, upload dataset via URL
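A hypothetical registration of a remote dataset (the id, purpose value, and URL are placeholders):

```bash
llama-stack-client datasets register \
  --dataset_id my_eval_dataset \
  --purpose eval/messages-answer \
  --url https://example.com/my_eval_dataset.jsonl
```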
### `llama-stack-client datasets unregister`
Remove a dataset
```bash
llama-stack-client datasets unregister <dataset-id>
```

Required arguments:
- `DATASET_ID`: Id of the dataset

## Scoring Functions Management
Manage scoring functions.

### `llama-stack-client scoring_functions list`
Show available scoring functions on distribution endpoint
```bash
llama-stack-client scoring_functions list
```
```
┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━┓
┃ identifier ┃ provider_id ┃ description ┃ type ┃
┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━┩
│ basic::bfcl │ basic │ BFCL complex scoring │ scoring_function │
│ basic::docvqa │ basic │ DocVQA Visual Question & Answer scoring function │ scoring_function │
│ basic::equality │ basic │ Returns 1.0 if the input is equal to the target, 0.0 │ scoring_function │
│ │ │ otherwise. │ │
└────────────────────────────────────────────┴──────────────┴───────────────────────────────────────────────────────────────┴──────────────────┘
```

### `llama-stack-client scoring_functions register`
Register a new scoring function
```bash
llama-stack-client scoring_functions register --scoring-fn-id <scoring-fn-id> --description <description> --return-type <return-type> [--provider-id <provider-id>] [--provider-scoring-fn-id <provider-scoring-fn-id>] [--params <params>]
```

Required arguments:
- `--scoring-fn-id`: Id of the scoring function
- `--description`: Description of the scoring function
- `--return-type`: Return type of the scoring function

Optional arguments:
- `--provider-id`: Provider ID for the scoring function
- `--provider-scoring-fn-id`: Provider's scoring function ID
- `--params`: Parameters for the scoring function in JSON format
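A hypothetical registration; the id, description, and provider are placeholders, and the JSON shape passed to `--return-type` is an assumption rather than a documented format:

```bash
llama-stack-client scoring_functions register \
  --scoring-fn-id my_custom::equality \
  --description "Exact-match scorer for my eval set" \
  --return-type '{"type": "string"}' \
  --provider-id basic
```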
## Post Training Management
Post-training.

### `llama-stack-client post_training list`
Show the list of available post training jobs
```bash
llama-stack-client post_training list
```
```bash
["job-1", "job-2", "job-3"]
```

### `llama-stack-client post_training artifacts`
Get the training artifacts of a specific post training job
```bash
llama-stack-client post_training artifacts --job-uuid <job-uuid>
```
```bash
JobArtifactsResponse(checkpoints=[], job_uuid='job-1')
```

Required arguments:
- `--job-uuid`: Job UUID

### `llama-stack-client post_training supervised_fine_tune`
Kick off a supervised fine tune job
```bash
llama-stack-client post_training supervised_fine_tune --job-uuid <job-uuid> --model <model> --algorithm-config <algorithm-config> --training-config <training-config> [--checkpoint-dir <checkpoint-dir>]
```

Required arguments:
- `--job-uuid`: Job UUID
- `--model`: Model ID
- `--algorithm-config`: Algorithm Config
- `--training-config`: Training Config

Optional arguments:
- `--checkpoint-dir`: Checkpoint Config
### `llama-stack-client post_training status`
Show the status of a specific post training job
```bash
llama-stack-client post_training status --job-uuid <job-uuid>
```
```bash
JobStatusResponse(
    checkpoints=[],
    job_uuid='job-1',
    status='completed',
    completed_at="",
    resources_allocated="",
    scheduled_at="",
    started_at=""
)
```

Required arguments:
- `--job-uuid`: Job UUID

### `llama-stack-client post_training cancel`
Cancel the training job
```bash
llama-stack-client post_training cancel --job-uuid <job-uuid>
```
```bash
# This functionality is not yet implemented for llama-stack-client
╭────────────────────────────────────────────────────────────╮
│ Failed to post_training cancel_training_job                 │
│                                                             │
│ Error Type: InternalServerError                             │
│ Details: Error code: 501 - {'detail': 'Not implemented: '}  │
╰────────────────────────────────────────────────────────────╯
```

Required arguments:
- `--job-uuid`: Job UUID