mirror of
https://github.com/meta-llama/llama-stack.git
synced 2025-07-16 18:08:09 +00:00
docs: Reorganize documentation on the webpage (#2651)
Some checks failed
SqlStore Integration Tests / test-postgres (3.13) (push) Failing after 2s
Integration Tests / discover-tests (push) Successful in 2s
Vector IO Integration Tests / test-matrix (3.12, inline::milvus) (push) Failing after 17s
Integration Auth Tests / test-matrix (oauth2_token) (push) Failing after 19s
Python Package Build Test / build (3.12) (push) Failing after 14s
Test Llama Stack Build / build-custom-container-distribution (push) Failing after 14s
Vector IO Integration Tests / test-matrix (3.12, remote::pgvector) (push) Failing after 15s
SqlStore Integration Tests / test-postgres (3.12) (push) Failing after 20s
Unit Tests / unit-tests (3.13) (push) Failing after 15s
Test Llama Stack Build / generate-matrix (push) Successful in 16s
Vector IO Integration Tests / test-matrix (3.13, remote::pgvector) (push) Failing after 20s
Test External Providers / test-external-providers (venv) (push) Failing after 17s
Update ReadTheDocs / update-readthedocs (push) Failing after 15s
Test Llama Stack Build / build-single-provider (push) Failing after 21s
Test Llama Stack Build / build-ubi9-container-distribution (push) Failing after 18s
Unit Tests / unit-tests (3.12) (push) Failing after 22s
Vector IO Integration Tests / test-matrix (3.12, inline::sqlite-vec) (push) Failing after 25s
Vector IO Integration Tests / test-matrix (3.13, remote::chromadb) (push) Failing after 23s
Vector IO Integration Tests / test-matrix (3.13, inline::milvus) (push) Failing after 26s
Vector IO Integration Tests / test-matrix (3.13, inline::sqlite-vec) (push) Failing after 19s
Vector IO Integration Tests / test-matrix (3.12, inline::faiss) (push) Failing after 28s
Vector IO Integration Tests / test-matrix (3.13, inline::faiss) (push) Failing after 21s
Vector IO Integration Tests / test-matrix (3.12, remote::chromadb) (push) Failing after 23s
Python Package Build Test / build (3.13) (push) Failing after 44s
Test Llama Stack Build / build (push) Failing after 25s
Integration Tests / test-matrix (push) Failing after 46s
Pre-commit / pre-commit (push) Successful in 2m24s
Some checks failed
SqlStore Integration Tests / test-postgres (3.13) (push) Failing after 2s
Integration Tests / discover-tests (push) Successful in 2s
Vector IO Integration Tests / test-matrix (3.12, inline::milvus) (push) Failing after 17s
Integration Auth Tests / test-matrix (oauth2_token) (push) Failing after 19s
Python Package Build Test / build (3.12) (push) Failing after 14s
Test Llama Stack Build / build-custom-container-distribution (push) Failing after 14s
Vector IO Integration Tests / test-matrix (3.12, remote::pgvector) (push) Failing after 15s
SqlStore Integration Tests / test-postgres (3.12) (push) Failing after 20s
Unit Tests / unit-tests (3.13) (push) Failing after 15s
Test Llama Stack Build / generate-matrix (push) Successful in 16s
Vector IO Integration Tests / test-matrix (3.13, remote::pgvector) (push) Failing after 20s
Test External Providers / test-external-providers (venv) (push) Failing after 17s
Update ReadTheDocs / update-readthedocs (push) Failing after 15s
Test Llama Stack Build / build-single-provider (push) Failing after 21s
Test Llama Stack Build / build-ubi9-container-distribution (push) Failing after 18s
Unit Tests / unit-tests (3.12) (push) Failing after 22s
Vector IO Integration Tests / test-matrix (3.12, inline::sqlite-vec) (push) Failing after 25s
Vector IO Integration Tests / test-matrix (3.13, remote::chromadb) (push) Failing after 23s
Vector IO Integration Tests / test-matrix (3.13, inline::milvus) (push) Failing after 26s
Vector IO Integration Tests / test-matrix (3.13, inline::sqlite-vec) (push) Failing after 19s
Vector IO Integration Tests / test-matrix (3.12, inline::faiss) (push) Failing after 28s
Vector IO Integration Tests / test-matrix (3.13, inline::faiss) (push) Failing after 21s
Vector IO Integration Tests / test-matrix (3.12, remote::chromadb) (push) Failing after 23s
Python Package Build Test / build (3.13) (push) Failing after 44s
Test Llama Stack Build / build (push) Failing after 25s
Integration Tests / test-matrix (push) Failing after 46s
Pre-commit / pre-commit (push) Successful in 2m24s
# What does this PR do? Reorganizes the Llama stack webpage into more concise index pages, introduce more of a workflow, and reduce repetition of content. New nav structure so far based on #2637 Further discussions in https://github.com/meta-llama/llama-stack/discussions/2585 **Preview:**  You can also build a full local preview locally **Feedback** Looking for feedback on page titles and general feedback on the new structure **Follow up documentation** I plan on reducing some sections and standardizing some terminology in a follow up PR. More discussions on that in https://github.com/meta-llama/llama-stack/discussions/2585
This commit is contained in:
parent
e1755d1ed2
commit
b096794959
34 changed files with 487 additions and 249 deletions
6
docs/source/advanced_apis/eval/index.md
Normal file
6
docs/source/advanced_apis/eval/index.md
Normal file
|
@ -0,0 +1,6 @@
|
|||
# Eval Providers
|
||||
|
||||
This section contains documentation for all available providers for the **eval** API.
|
||||
|
||||
- [inline::meta-reference](inline_meta-reference.md)
|
||||
- [remote::nvidia](remote_nvidia.md)
|
21
docs/source/advanced_apis/eval/inline_meta-reference.md
Normal file
21
docs/source/advanced_apis/eval/inline_meta-reference.md
Normal file
|
@ -0,0 +1,21 @@
|
|||
# inline::meta-reference
|
||||
|
||||
## Description
|
||||
|
||||
Meta's reference implementation of evaluation tasks with support for multiple languages and evaluation metrics.
|
||||
|
||||
## Configuration
|
||||
|
||||
| Field | Type | Required | Default | Description |
|
||||
|-------|------|----------|---------|-------------|
|
||||
| `kvstore` | `utils.kvstore.config.RedisKVStoreConfig \| utils.kvstore.config.SqliteKVStoreConfig \| utils.kvstore.config.PostgresKVStoreConfig \| utils.kvstore.config.MongoDBKVStoreConfig` | No | sqlite | |
|
||||
|
||||
## Sample Configuration
|
||||
|
||||
```yaml
|
||||
kvstore:
|
||||
type: sqlite
|
||||
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/dummy}/meta_reference_eval.db
|
||||
|
||||
```
|
||||
|
19
docs/source/advanced_apis/eval/remote_nvidia.md
Normal file
19
docs/source/advanced_apis/eval/remote_nvidia.md
Normal file
|
@ -0,0 +1,19 @@
|
|||
# remote::nvidia
|
||||
|
||||
## Description
|
||||
|
||||
NVIDIA's evaluation provider for running evaluation tasks on NVIDIA's platform.
|
||||
|
||||
## Configuration
|
||||
|
||||
| Field | Type | Required | Default | Description |
|
||||
|-------|------|----------|---------|-------------|
|
||||
| `evaluator_url` | `<class 'str'>` | No | http://0.0.0.0:7331 | The url for accessing the evaluator service |
|
||||
|
||||
## Sample Configuration
|
||||
|
||||
```yaml
|
||||
evaluator_url: ${env.NVIDIA_EVALUATOR_URL:=http://localhost:7331}
|
||||
|
||||
```
|
||||
|
33
docs/source/advanced_apis/index.md
Normal file
33
docs/source/advanced_apis/index.md
Normal file
|
@ -0,0 +1,33 @@
|
|||
# Advanced APIs
|
||||
|
||||
## Post-training
|
||||
Fine-tunes a model.
|
||||
|
||||
```{toctree}
|
||||
:maxdepth: 1
|
||||
|
||||
post_training/index
|
||||
```
|
||||
|
||||
## Eval
|
||||
Generates outputs (via Inference or Agents) and perform scoring.
|
||||
|
||||
```{toctree}
|
||||
:maxdepth: 1
|
||||
|
||||
eval/index
|
||||
```
|
||||
|
||||
```{include} evaluation_concepts.md
|
||||
:start-after: ## Evaluation Concepts
|
||||
```
|
||||
|
||||
## Scoring
|
||||
Evaluates the outputs of the system.
|
||||
|
||||
```{toctree}
|
||||
:maxdepth: 1
|
||||
|
||||
scoring/index
|
||||
```
|
||||
|
7
docs/source/advanced_apis/post_training/index.md
Normal file
7
docs/source/advanced_apis/post_training/index.md
Normal file
|
@ -0,0 +1,7 @@
|
|||
# Post_Training Providers
|
||||
|
||||
This section contains documentation for all available providers for the **post_training** API.
|
||||
|
||||
- [inline::huggingface](inline_huggingface.md)
|
||||
- [inline::torchtune](inline_torchtune.md)
|
||||
- [remote::nvidia](remote_nvidia.md)
|
|
@ -0,0 +1,33 @@
|
|||
# inline::huggingface
|
||||
|
||||
## Description
|
||||
|
||||
HuggingFace-based post-training provider for fine-tuning models using the HuggingFace ecosystem.
|
||||
|
||||
## Configuration
|
||||
|
||||
| Field | Type | Required | Default | Description |
|
||||
|-------|------|----------|---------|-------------|
|
||||
| `device` | `<class 'str'>` | No | cuda | |
|
||||
| `distributed_backend` | `Literal['fsdp', 'deepspeed'` | No | | |
|
||||
| `checkpoint_format` | `Literal['full_state', 'huggingface'` | No | huggingface | |
|
||||
| `chat_template` | `<class 'str'>` | No | |
|
||||
| `model_specific_config` | `<class 'dict'>` | No | {'trust_remote_code': True, 'attn_implementation': 'sdpa'} | |
|
||||
| `max_seq_length` | `<class 'int'>` | No | 2048 | |
|
||||
| `gradient_checkpointing` | `<class 'bool'>` | No | False | |
|
||||
| `save_total_limit` | `<class 'int'>` | No | 3 | |
|
||||
| `logging_steps` | `<class 'int'>` | No | 10 | |
|
||||
| `warmup_ratio` | `<class 'float'>` | No | 0.1 | |
|
||||
| `weight_decay` | `<class 'float'>` | No | 0.01 | |
|
||||
| `dataloader_num_workers` | `<class 'int'>` | No | 4 | |
|
||||
| `dataloader_pin_memory` | `<class 'bool'>` | No | True | |
|
||||
|
||||
## Sample Configuration
|
||||
|
||||
```yaml
|
||||
checkpoint_format: huggingface
|
||||
distributed_backend: null
|
||||
device: cpu
|
||||
|
||||
```
|
||||
|
20
docs/source/advanced_apis/post_training/inline_torchtune.md
Normal file
20
docs/source/advanced_apis/post_training/inline_torchtune.md
Normal file
|
@ -0,0 +1,20 @@
|
|||
# inline::torchtune
|
||||
|
||||
## Description
|
||||
|
||||
TorchTune-based post-training provider for fine-tuning and optimizing models using Meta's TorchTune framework.
|
||||
|
||||
## Configuration
|
||||
|
||||
| Field | Type | Required | Default | Description |
|
||||
|-------|------|----------|---------|-------------|
|
||||
| `torch_seed` | `int \| None` | No | | |
|
||||
| `checkpoint_format` | `Literal['meta', 'huggingface'` | No | meta | |
|
||||
|
||||
## Sample Configuration
|
||||
|
||||
```yaml
|
||||
checkpoint_format: meta
|
||||
|
||||
```
|
||||
|
28
docs/source/advanced_apis/post_training/remote_nvidia.md
Normal file
28
docs/source/advanced_apis/post_training/remote_nvidia.md
Normal file
|
@ -0,0 +1,28 @@
|
|||
# remote::nvidia
|
||||
|
||||
## Description
|
||||
|
||||
NVIDIA's post-training provider for fine-tuning models on NVIDIA's platform.
|
||||
|
||||
## Configuration
|
||||
|
||||
| Field | Type | Required | Default | Description |
|
||||
|-------|------|----------|---------|-------------|
|
||||
| `api_key` | `str \| None` | No | | The NVIDIA API key. |
|
||||
| `dataset_namespace` | `str \| None` | No | default | The NVIDIA dataset namespace. |
|
||||
| `project_id` | `str \| None` | No | test-example-model@v1 | The NVIDIA project ID. |
|
||||
| `customizer_url` | `str \| None` | No | | Base URL for the NeMo Customizer API |
|
||||
| `timeout` | `<class 'int'>` | No | 300 | Timeout for the NVIDIA Post Training API |
|
||||
| `max_retries` | `<class 'int'>` | No | 3 | Maximum number of retries for the NVIDIA Post Training API |
|
||||
| `output_model_dir` | `<class 'str'>` | No | test-example-model@v1 | Directory to save the output model |
|
||||
|
||||
## Sample Configuration
|
||||
|
||||
```yaml
|
||||
api_key: ${env.NVIDIA_API_KEY:=}
|
||||
dataset_namespace: ${env.NVIDIA_DATASET_NAMESPACE:=default}
|
||||
project_id: ${env.NVIDIA_PROJECT_ID:=test-project}
|
||||
customizer_url: ${env.NVIDIA_CUSTOMIZER_URL:=http://nemo.test}
|
||||
|
||||
```
|
||||
|
7
docs/source/advanced_apis/scoring/index.md
Normal file
7
docs/source/advanced_apis/scoring/index.md
Normal file
|
@ -0,0 +1,7 @@
|
|||
# Scoring Providers
|
||||
|
||||
This section contains documentation for all available providers for the **scoring** API.
|
||||
|
||||
- [inline::basic](inline_basic.md)
|
||||
- [inline::braintrust](inline_braintrust.md)
|
||||
- [inline::llm-as-judge](inline_llm-as-judge.md)
|
13
docs/source/advanced_apis/scoring/inline_basic.md
Normal file
13
docs/source/advanced_apis/scoring/inline_basic.md
Normal file
|
@ -0,0 +1,13 @@
|
|||
# inline::basic
|
||||
|
||||
## Description
|
||||
|
||||
Basic scoring provider for simple evaluation metrics and scoring functions.
|
||||
|
||||
## Sample Configuration
|
||||
|
||||
```yaml
|
||||
{}
|
||||
|
||||
```
|
||||
|
19
docs/source/advanced_apis/scoring/inline_braintrust.md
Normal file
19
docs/source/advanced_apis/scoring/inline_braintrust.md
Normal file
|
@ -0,0 +1,19 @@
|
|||
# inline::braintrust
|
||||
|
||||
## Description
|
||||
|
||||
Braintrust scoring provider for evaluation and scoring using the Braintrust platform.
|
||||
|
||||
## Configuration
|
||||
|
||||
| Field | Type | Required | Default | Description |
|
||||
|-------|------|----------|---------|-------------|
|
||||
| `openai_api_key` | `str \| None` | No | | The OpenAI API Key |
|
||||
|
||||
## Sample Configuration
|
||||
|
||||
```yaml
|
||||
openai_api_key: ${env.OPENAI_API_KEY:=}
|
||||
|
||||
```
|
||||
|
13
docs/source/advanced_apis/scoring/inline_llm-as-judge.md
Normal file
13
docs/source/advanced_apis/scoring/inline_llm-as-judge.md
Normal file
|
@ -0,0 +1,13 @@
|
|||
# inline::llm-as-judge
|
||||
|
||||
## Description
|
||||
|
||||
LLM-as-judge scoring provider that uses language models to evaluate and score responses.
|
||||
|
||||
## Sample Configuration
|
||||
|
||||
```yaml
|
||||
{}
|
||||
|
||||
```
|
||||
|
|
@ -1,4 +1,4 @@
|
|||
# Building AI Applications (Examples)
|
||||
# AI Application Examples
|
||||
|
||||
Llama Stack provides all the building blocks needed to create sophisticated AI applications.
|
||||
|
||||
|
@ -27,4 +27,5 @@ tools
|
|||
evals
|
||||
telemetry
|
||||
safety
|
||||
```
|
||||
playground/index
|
||||
```
|
|
@ -1,4 +1,4 @@
|
|||
# Llama Stack Playground
|
||||
## Llama Stack Playground
|
||||
|
||||
```{note}
|
||||
The Llama Stack Playground is currently experimental and subject to change. We welcome feedback and contributions to help improve it.
|
||||
|
@ -9,7 +9,7 @@ The Llama Stack Playground is an simple interface which aims to:
|
|||
- Demo **end-to-end** application code to help users get started to build their own applications
|
||||
- Provide an **UI** to help users inspect and understand Llama Stack API providers and resources
|
||||
|
||||
## Key Features
|
||||
### Key Features
|
||||
|
||||
#### Playground
|
||||
Interactive pages for users to play with and explore Llama Stack API capabilities.
|
||||
|
@ -90,7 +90,7 @@ Interactive pages for users to play with and explore Llama Stack API capabilitie
|
|||
- Under the hood, it uses Llama Stack's `/<resources>/list` API to get information about each resources.
|
||||
- Please visit [Core Concepts](https://llama-stack.readthedocs.io/en/latest/concepts/index.html) for more details about the resources.
|
||||
|
||||
## Starting the Llama Stack Playground
|
||||
### Starting the Llama Stack Playground
|
||||
|
||||
To start the Llama Stack Playground, run the following commands:
|
||||
|
|
@ -1,31 +1,39 @@
|
|||
# Why Llama Stack?
|
||||
## Llama Stack architecture
|
||||
|
||||
Building production AI applications today requires solving multiple challenges:
|
||||
|
||||
**Infrastructure Complexity**
|
||||
- Running large language models efficiently requires specialized infrastructure.
|
||||
- Different deployment scenarios (local development, cloud, edge) need different solutions.
|
||||
- Moving from development to production often requires significant rework.
|
||||
|
||||
**Essential Capabilities**
|
||||
- Safety guardrails and content filtering are necessary in an enterprise setting.
|
||||
- Just model inference is not enough - Knowledge retrieval and RAG capabilities are required.
|
||||
- Nearly any application needs composable multi-step workflows.
|
||||
- Finally, without monitoring, observability and evaluation, you end up operating in the dark.
|
||||
|
||||
**Lack of Flexibility and Choice**
|
||||
- Directly integrating with multiple providers creates tight coupling.
|
||||
- Different providers have different APIs and abstractions.
|
||||
- Changing providers requires significant code changes.
|
||||
|
||||
|
||||
### Our Solution: A Universal Stack
|
||||
Llama Stack allows you to build different layers of distributions for your AI workloads using various SDKs and API providers.
|
||||
|
||||
```{image} ../../_static/llama-stack.png
|
||||
:alt: Llama Stack
|
||||
:width: 400px
|
||||
```
|
||||
|
||||
### Benefits of Llama stack
|
||||
|
||||
#### Current challenges in custom AI applications
|
||||
|
||||
Building production AI applications today requires solving multiple challenges:
|
||||
|
||||
Infrastructure Complexity
|
||||
|
||||
- Running large language models efficiently requires specialized infrastructure.
|
||||
- Different deployment scenarios (local development, cloud, edge) need different solutions.
|
||||
- Moving from development to production often requires significant rework.
|
||||
|
||||
**Essential Capabilities**
|
||||
|
||||
- Safety guardrails and content filtering are necessary in an enterprise setting.
|
||||
- Just model inference is not enough - Knowledge retrieval and RAG capabilities are required.
|
||||
- Nearly any application needs composable multi-step workflows.
|
||||
- Without monitoring, observability and evaluation, you end up operating in the dark.
|
||||
|
||||
**Lack of Flexibility and Choice**
|
||||
|
||||
- Directly integrating with multiple providers creates tight coupling.
|
||||
- Different providers have different APIs and abstractions.
|
||||
- Changing providers requires significant code changes.
|
||||
|
||||
#### Our Solution: A Universal Stack
|
||||
|
||||
Llama Stack addresses these challenges through a service-oriented, API-first approach:
|
||||
|
||||
**Develop Anywhere, Deploy Everywhere**
|
||||
|
@ -59,4 +67,4 @@ Llama Stack addresses these challenges through a service-oriented, API-first app
|
|||
- **Turnkey Solutions**: Easy to deploy built in solutions for popular deployment scenarios
|
||||
|
||||
|
||||
With Llama Stack, you can focus on building your application while we handle the infrastructure complexity, essential capabilities, and provider integrations.
|
||||
With Llama Stack, you can focus on building your application while we handle the infrastructure complexity, essential capabilities, and provider integrations.
|
|
@ -2,6 +2,10 @@
|
|||
|
||||
Given Llama Stack's service-oriented philosophy, a few concepts and workflows arise which may not feel completely natural in the LLM landscape, especially if you are coming with a background in other frameworks.
|
||||
|
||||
```{include} architecture.md
|
||||
:start-after: ## Llama Stack architecture
|
||||
```
|
||||
|
||||
```{include} apis.md
|
||||
:start-after: ## APIs
|
||||
```
|
||||
|
@ -10,14 +14,10 @@ Given Llama Stack's service-oriented philosophy, a few concepts and workflows ar
|
|||
:start-after: ## API Providers
|
||||
```
|
||||
|
||||
```{include} resources.md
|
||||
:start-after: ## Resources
|
||||
```
|
||||
|
||||
```{include} distributions.md
|
||||
:start-after: ## Distributions
|
||||
```
|
||||
|
||||
```{include} evaluation_concepts.md
|
||||
:start-after: ## Evaluation Concepts
|
||||
```{include} resources.md
|
||||
:start-after: ## Resources
|
||||
```
|
||||
|
|
|
@ -52,7 +52,18 @@ extensions = [
|
|||
"sphinxcontrib.redoc",
|
||||
"sphinxcontrib.mermaid",
|
||||
"sphinxcontrib.video",
|
||||
"sphinx_reredirects"
|
||||
]
|
||||
|
||||
redirects = {
|
||||
"providers/post_training/index": "../../advanced_apis/post_training/index.html",
|
||||
"providers/eval/index": "../../advanced_apis/eval/index.html",
|
||||
"providers/scoring/index": "../../advanced_apis/scoring/index.html",
|
||||
"playground/index": "../../building_applications/playground/index.html",
|
||||
"openai/index": "../../providers/index.html#openai-api-compatibility",
|
||||
"introduction/index": "../concepts/index.html#llama-stack-architecture"
|
||||
}
|
||||
|
||||
myst_enable_extensions = ["colon_fence"]
|
||||
|
||||
html_theme = "sphinx_rtd_theme"
|
||||
|
|
4
docs/source/deploying/index.md
Normal file
4
docs/source/deploying/index.md
Normal file
|
@ -0,0 +1,4 @@
|
|||
# Deployment Examples
|
||||
|
||||
```{include} kubernetes_deployment.md
|
||||
```
|
|
@ -1,4 +1,4 @@
|
|||
# Kubernetes Deployment Guide
|
||||
## Kubernetes Deployment Guide
|
||||
|
||||
Instead of starting the Llama Stack and vLLM servers locally. We can deploy them in a Kubernetes cluster.
|
||||
|
|
@ -6,14 +6,9 @@ This section provides an overview of the distributions available in Llama Stack.
|
|||
|
||||
```{toctree}
|
||||
:maxdepth: 3
|
||||
|
||||
list_of_distributions
|
||||
building_distro
|
||||
customizing_run_yaml
|
||||
importing_as_library
|
||||
configuration
|
||||
customizing_run_yaml
|
||||
list_of_distributions
|
||||
kubernetes_deployment
|
||||
building_distro
|
||||
on_device_distro
|
||||
remote_hosted_distro
|
||||
self_hosted_distro
|
||||
```
|
||||
|
|
|
@ -28,5 +28,4 @@ If you have built a container image and want to deploy it in a Kubernetes cluste
|
|||
|
||||
importing_as_library
|
||||
configuration
|
||||
kubernetes_deployment
|
||||
```
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
# Detailed Tutorial
|
||||
## Detailed Tutorial
|
||||
|
||||
In this guide, we'll walk through how you can use the Llama Stack (server and client SDK) to test a simple agent.
|
||||
A Llama Stack agent is a simple integrated system that can perform tasks by combining a Llama model for reasoning with
|
||||
|
@ -10,7 +10,7 @@ Llama Stack is a stateful service with REST APIs to support seamless transition
|
|||
In this guide, we'll walk through how to build a RAG agent locally using Llama Stack with [Ollama](https://ollama.com/)
|
||||
as the inference [provider](../providers/index.md#inference) for a Llama Model.
|
||||
|
||||
## Step 1: Installation and Setup
|
||||
### Step 1: Installation and Setup
|
||||
|
||||
Install Ollama by following the instructions on the [Ollama website](https://ollama.com/download), then
|
||||
download Llama 3.2 3B model, and then start the Ollama service.
|
||||
|
@ -45,7 +45,7 @@ Setup your virtual environment.
|
|||
uv sync --python 3.12
|
||||
source .venv/bin/activate
|
||||
```
|
||||
## Step 2: Run Llama Stack
|
||||
### Step 2: Run Llama Stack
|
||||
Llama Stack is a server that exposes multiple APIs, you connect with it using the Llama Stack client SDK.
|
||||
|
||||
::::{tab-set}
|
||||
|
@ -132,7 +132,7 @@ Now you can use the Llama Stack client to run inference and build agents!
|
|||
You can reuse the server setup or use the [Llama Stack Client](https://github.com/meta-llama/llama-stack-client-python/).
|
||||
Note that the client package is already included in the `llama-stack` package.
|
||||
|
||||
## Step 3: Run Client CLI
|
||||
### Step 3: Run Client CLI
|
||||
|
||||
Open a new terminal and navigate to the same directory you started the server from. Then set up a new or activate your
|
||||
existing server virtual environment.
|
||||
|
@ -232,7 +232,7 @@ OpenAIChatCompletion(
|
|||
)
|
||||
```
|
||||
|
||||
## Step 4: Run the Demos
|
||||
### Step 4: Run the Demos
|
||||
|
||||
Note that these demos show the [Python Client SDK](../references/python_sdk_reference/index.md).
|
||||
Other SDKs are also available, please refer to the [Client SDK](../index.md#client-sdks) list for the complete options.
|
||||
|
@ -242,7 +242,7 @@ Other SDKs are also available, please refer to the [Client SDK](../index.md#clie
|
|||
:::{tab-item} Basic Inference
|
||||
Now you can run inference using the Llama Stack client SDK.
|
||||
|
||||
### i. Create the Script
|
||||
#### i. Create the Script
|
||||
|
||||
Create a file `inference.py` and add the following code:
|
||||
```python
|
||||
|
@ -269,7 +269,7 @@ response = client.chat.completions.create(
|
|||
print(response)
|
||||
```
|
||||
|
||||
### ii. Run the Script
|
||||
#### ii. Run the Script
|
||||
Let's run the script using `uv`
|
||||
```bash
|
||||
uv run python inference.py
|
||||
|
@ -283,7 +283,7 @@ OpenAIChatCompletion(id='chatcmpl-30cd0f28-a2ad-4b6d-934b-13707fc60ebf', choices
|
|||
|
||||
:::{tab-item} Build a Simple Agent
|
||||
Next we can move beyond simple inference and build an agent that can perform tasks using the Llama Stack server.
|
||||
### i. Create the Script
|
||||
#### i. Create the Script
|
||||
Create a file `agent.py` and add the following code:
|
||||
|
||||
```python
|
||||
|
@ -455,7 +455,7 @@ uv run python agent.py
|
|||
|
||||
For our last demo, we can build a RAG agent that can answer questions about the Torchtune project using the documents
|
||||
in a vector database.
|
||||
### i. Create the Script
|
||||
#### i. Create the Script
|
||||
Create a file `rag_agent.py` and add the following code:
|
||||
|
||||
```python
|
||||
|
@ -533,7 +533,7 @@ for t in turns:
|
|||
for event in AgentEventLogger().log(stream):
|
||||
event.print()
|
||||
```
|
||||
### ii. Run the Script
|
||||
#### ii. Run the Script
|
||||
Let's run the script using `uv`
|
||||
```bash
|
||||
uv run python rag_agent.py
|
||||
|
|
|
@ -1,123 +1,13 @@
|
|||
# Quickstart
|
||||
# Getting Started
|
||||
|
||||
Get started with Llama Stack in minutes!
|
||||
|
||||
Llama Stack is a stateful service with REST APIs to support the seamless transition of AI applications across different
|
||||
environments. You can build and test using a local server first and deploy to a hosted endpoint for production.
|
||||
|
||||
In this guide, we'll walk through how to build a RAG application locally using Llama Stack with [Ollama](https://ollama.com/)
|
||||
as the inference [provider](../providers/inference/index) for a Llama Model.
|
||||
|
||||
**💡 Notebook Version:** You can also follow this quickstart guide in a Jupyter notebook format: [quick_start.ipynb](https://github.com/meta-llama/llama-stack/blob/main/docs/quick_start.ipynb)
|
||||
|
||||
#### Step 1: Install and setup
|
||||
1. Install [uv](https://docs.astral.sh/uv/)
|
||||
2. Run inference on a Llama model with [Ollama](https://ollama.com/download)
|
||||
```bash
|
||||
ollama run llama3.2:3b --keepalive 60m
|
||||
```{include} quickstart.md
|
||||
:start-after: ## Quickstart
|
||||
```
|
||||
#### Step 2: Run the Llama Stack server
|
||||
We will use `uv` to run the Llama Stack server.
|
||||
```bash
|
||||
INFERENCE_MODEL=llama3.2:3b uv run --with llama-stack llama stack build --template starter --image-type venv --run
|
||||
|
||||
```{include} libraries.md
|
||||
:start-after: ## Libraries (SDKs)
|
||||
```
|
||||
#### Step 3: Run the demo
|
||||
Now open up a new terminal and copy the following script into a file named `demo_script.py`.
|
||||
|
||||
```python
|
||||
from llama_stack_client import Agent, AgentEventLogger, RAGDocument, LlamaStackClient
|
||||
|
||||
vector_db_id = "my_demo_vector_db"
|
||||
client = LlamaStackClient(base_url="http://localhost:8321")
|
||||
|
||||
models = client.models.list()
|
||||
|
||||
# Select the first LLM and first embedding models
|
||||
model_id = next(m for m in models if m.model_type == "llm").identifier
|
||||
embedding_model_id = (
|
||||
em := next(m for m in models if m.model_type == "embedding")
|
||||
).identifier
|
||||
embedding_dimension = em.metadata["embedding_dimension"]
|
||||
|
||||
_ = client.vector_dbs.register(
|
||||
vector_db_id=vector_db_id,
|
||||
embedding_model=embedding_model_id,
|
||||
embedding_dimension=embedding_dimension,
|
||||
provider_id="faiss",
|
||||
)
|
||||
source = "https://www.paulgraham.com/greatwork.html"
|
||||
print("rag_tool> Ingesting document:", source)
|
||||
document = RAGDocument(
|
||||
document_id="document_1",
|
||||
content=source,
|
||||
mime_type="text/html",
|
||||
metadata={},
|
||||
)
|
||||
client.tool_runtime.rag_tool.insert(
|
||||
documents=[document],
|
||||
vector_db_id=vector_db_id,
|
||||
chunk_size_in_tokens=50,
|
||||
)
|
||||
agent = Agent(
|
||||
client,
|
||||
model=model_id,
|
||||
instructions="You are a helpful assistant",
|
||||
tools=[
|
||||
{
|
||||
"name": "builtin::rag/knowledge_search",
|
||||
"args": {"vector_db_ids": [vector_db_id]},
|
||||
}
|
||||
],
|
||||
)
|
||||
|
||||
prompt = "How do you do great work?"
|
||||
print("prompt>", prompt)
|
||||
|
||||
response = agent.create_turn(
|
||||
messages=[{"role": "user", "content": prompt}],
|
||||
session_id=agent.create_session("rag_session"),
|
||||
stream=True,
|
||||
)
|
||||
|
||||
for log in AgentEventLogger().log(response):
|
||||
log.print()
|
||||
```{include} detailed_tutorial.md
|
||||
:start-after: ## Detailed Tutorial
|
||||
```
|
||||
We will use `uv` to run the script
|
||||
```
|
||||
uv run --with llama-stack-client,fire,requests demo_script.py
|
||||
```
|
||||
And you should see output like below.
|
||||
```
|
||||
rag_tool> Ingesting document: https://www.paulgraham.com/greatwork.html
|
||||
|
||||
prompt> How do you do great work?
|
||||
|
||||
inference> [knowledge_search(query="What is the key to doing great work")]
|
||||
|
||||
tool_execution> Tool:knowledge_search Args:{'query': 'What is the key to doing great work'}
|
||||
|
||||
tool_execution> Tool:knowledge_search Response:[TextContentItem(text='knowledge_search tool found 5 chunks:\nBEGIN of knowledge_search tool results.\n', type='text'), TextContentItem(text="Result 1:\nDocument_id:docum\nContent: work. Doing great work means doing something important\nso well that you expand people's ideas of what's possible. But\nthere's no threshold for importance. It's a matter of degree, and\noften hard to judge at the time anyway.\n", type='text'), TextContentItem(text="Result 2:\nDocument_id:docum\nContent: work. Doing great work means doing something important\nso well that you expand people's ideas of what's possible. But\nthere's no threshold for importance. It's a matter of degree, and\noften hard to judge at the time anyway.\n", type='text'), TextContentItem(text="Result 3:\nDocument_id:docum\nContent: work. Doing great work means doing something important\nso well that you expand people's ideas of what's possible. But\nthere's no threshold for importance. It's a matter of degree, and\noften hard to judge at the time anyway.\n", type='text'), TextContentItem(text="Result 4:\nDocument_id:docum\nContent: work. Doing great work means doing something important\nso well that you expand people's ideas of what's possible. But\nthere's no threshold for importance. It's a matter of degree, and\noften hard to judge at the time anyway.\n", type='text'), TextContentItem(text="Result 5:\nDocument_id:docum\nContent: work. Doing great work means doing something important\nso well that you expand people's ideas of what's possible. But\nthere's no threshold for importance. It's a matter of degree, and\noften hard to judge at the time anyway.\n", type='text'), TextContentItem(text='END of knowledge_search tool results.\n', type='text')]
|
||||
|
||||
inference> Based on the search results, it seems that doing great work means doing something important so well that you expand people's ideas of what's possible. However, there is no clear threshold for importance, and it can be difficult to judge at the time.
|
||||
|
||||
To further clarify, I would suggest that doing great work involves:
|
||||
|
||||
* Completing tasks with high quality and attention to detail
|
||||
* Expanding on existing knowledge or ideas
|
||||
* Making a positive impact on others through your work
|
||||
* Striving for excellence and continuous improvement
|
||||
|
||||
Ultimately, great work is about making a meaningful contribution and leaving a lasting impression.
|
||||
```
|
||||
Congratulations! You've successfully built your first RAG application using Llama Stack! 🎉🥳
|
||||
|
||||
## Next Steps
|
||||
|
||||
Now you're ready to dive deeper into Llama Stack!
|
||||
- Explore the [Detailed Tutorial](./detailed_tutorial.md).
|
||||
- Try the [Getting Started Notebook](https://github.com/meta-llama/llama-stack/blob/main/docs/getting_started.ipynb).
|
||||
- Browse more [Notebooks on GitHub](https://github.com/meta-llama/llama-stack/tree/main/docs/notebooks).
|
||||
- Learn about Llama Stack [Concepts](../concepts/index.md).
|
||||
- Discover how to [Build Llama Stacks](../distributions/index.md).
|
||||
- Refer to our [References](../references/index.md) for details on the Llama CLI and Python SDK.
|
||||
- Check out the [llama-stack-apps](https://github.com/meta-llama/llama-stack-apps/tree/main/examples) repository for example applications and tutorials.
|
||||
|
|
10
docs/source/getting_started/libraries.md
Normal file
10
docs/source/getting_started/libraries.md
Normal file
|
@ -0,0 +1,10 @@
|
|||
## Libraries (SDKs)
|
||||
|
||||
We have a number of client-side SDKs available for different languages.
|
||||
|
||||
| **Language** | **Client SDK** | **Package** |
|
||||
| :----: | :----: | :----: |
|
||||
| Python | [llama-stack-client-python](https://github.com/meta-llama/llama-stack-client-python) | [](https://pypi.org/project/llama_stack_client/)
|
||||
| Swift | [llama-stack-client-swift](https://github.com/meta-llama/llama-stack-client-swift/tree/latest-release) | [](https://swiftpackageindex.com/meta-llama/llama-stack-client-swift)
|
||||
| Node | [llama-stack-client-node](https://github.com/meta-llama/llama-stack-client-node) | [](https://npmjs.org/package/llama-stack-client)
|
||||
| Kotlin | [llama-stack-client-kotlin](https://github.com/meta-llama/llama-stack-client-kotlin/tree/latest-release) | [](https://central.sonatype.com/artifact/com.llama.llamastack/llama-stack-client-kotlin)
|
123
docs/source/getting_started/quickstart.md
Normal file
123
docs/source/getting_started/quickstart.md
Normal file
|
@ -0,0 +1,123 @@
|
|||
## Quickstart
|
||||
|
||||
Get started with Llama Stack in minutes!
|
||||
|
||||
Llama Stack is a stateful service with REST APIs to support the seamless transition of AI applications across different
|
||||
environments. You can build and test using a local server first and deploy to a hosted endpoint for production.
|
||||
|
||||
In this guide, we'll walk through how to build a RAG application locally using Llama Stack with [Ollama](https://ollama.com/)
|
||||
as the inference [provider](../providers/inference/index) for a Llama Model.
|
||||
|
||||
**💡 Notebook Version:** You can also follow this quickstart guide in a Jupyter notebook format: [quick_start.ipynb](https://github.com/meta-llama/llama-stack/blob/main/docs/quick_start.ipynb)
|
||||
|
||||
#### Step 1: Install and setup
|
||||
1. Install [uv](https://docs.astral.sh/uv/)
|
||||
2. Run inference on a Llama model with [Ollama](https://ollama.com/download)
|
||||
```bash
|
||||
ollama run llama3.2:3b --keepalive 60m
|
||||
```
|
||||
#### Step 2: Run the Llama Stack server
|
||||
We will use `uv` to run the Llama Stack server.
|
||||
```bash
|
||||
INFERENCE_MODEL=llama3.2:3b uv run --with llama-stack llama stack build --template starter --image-type venv --run
|
||||
```
|
||||
#### Step 3: Run the demo
|
||||
Now open up a new terminal and copy the following script into a file named `demo_script.py`.
|
||||
|
||||
```python
|
||||
from llama_stack_client import Agent, AgentEventLogger, RAGDocument, LlamaStackClient
|
||||
|
||||
vector_db_id = "my_demo_vector_db"
|
||||
client = LlamaStackClient(base_url="http://localhost:8321")
|
||||
|
||||
models = client.models.list()
|
||||
|
||||
# Select the first LLM and first embedding models
|
||||
model_id = next(m for m in models if m.model_type == "llm").identifier
|
||||
embedding_model_id = (
|
||||
em := next(m for m in models if m.model_type == "embedding")
|
||||
).identifier
|
||||
embedding_dimension = em.metadata["embedding_dimension"]
|
||||
|
||||
_ = client.vector_dbs.register(
|
||||
vector_db_id=vector_db_id,
|
||||
embedding_model=embedding_model_id,
|
||||
embedding_dimension=embedding_dimension,
|
||||
provider_id="faiss",
|
||||
)
|
||||
source = "https://www.paulgraham.com/greatwork.html"
|
||||
print("rag_tool> Ingesting document:", source)
|
||||
document = RAGDocument(
|
||||
document_id="document_1",
|
||||
content=source,
|
||||
mime_type="text/html",
|
||||
metadata={},
|
||||
)
|
||||
client.tool_runtime.rag_tool.insert(
|
||||
documents=[document],
|
||||
vector_db_id=vector_db_id,
|
||||
chunk_size_in_tokens=50,
|
||||
)
|
||||
agent = Agent(
|
||||
client,
|
||||
model=model_id,
|
||||
instructions="You are a helpful assistant",
|
||||
tools=[
|
||||
{
|
||||
"name": "builtin::rag/knowledge_search",
|
||||
"args": {"vector_db_ids": [vector_db_id]},
|
||||
}
|
||||
],
|
||||
)
|
||||
|
||||
prompt = "How do you do great work?"
|
||||
print("prompt>", prompt)
|
||||
|
||||
response = agent.create_turn(
|
||||
messages=[{"role": "user", "content": prompt}],
|
||||
session_id=agent.create_session("rag_session"),
|
||||
stream=True,
|
||||
)
|
||||
|
||||
for log in AgentEventLogger().log(response):
|
||||
log.print()
|
||||
```
|
||||
We will use `uv` to run the script
|
||||
```
|
||||
uv run --with llama-stack-client,fire,requests demo_script.py
|
||||
```
|
||||
And you should see output like below.
|
||||
```
|
||||
rag_tool> Ingesting document: https://www.paulgraham.com/greatwork.html
|
||||
|
||||
prompt> How do you do great work?
|
||||
|
||||
inference> [knowledge_search(query="What is the key to doing great work")]
|
||||
|
||||
tool_execution> Tool:knowledge_search Args:{'query': 'What is the key to doing great work'}
|
||||
|
||||
tool_execution> Tool:knowledge_search Response:[TextContentItem(text='knowledge_search tool found 5 chunks:\nBEGIN of knowledge_search tool results.\n', type='text'), TextContentItem(text="Result 1:\nDocument_id:docum\nContent: work. Doing great work means doing something important\nso well that you expand people's ideas of what's possible. But\nthere's no threshold for importance. It's a matter of degree, and\noften hard to judge at the time anyway.\n", type='text'), TextContentItem(text="Result 2:\nDocument_id:docum\nContent: work. Doing great work means doing something important\nso well that you expand people's ideas of what's possible. But\nthere's no threshold for importance. It's a matter of degree, and\noften hard to judge at the time anyway.\n", type='text'), TextContentItem(text="Result 3:\nDocument_id:docum\nContent: work. Doing great work means doing something important\nso well that you expand people's ideas of what's possible. But\nthere's no threshold for importance. It's a matter of degree, and\noften hard to judge at the time anyway.\n", type='text'), TextContentItem(text="Result 4:\nDocument_id:docum\nContent: work. Doing great work means doing something important\nso well that you expand people's ideas of what's possible. But\nthere's no threshold for importance. It's a matter of degree, and\noften hard to judge at the time anyway.\n", type='text'), TextContentItem(text="Result 5:\nDocument_id:docum\nContent: work. Doing great work means doing something important\nso well that you expand people's ideas of what's possible. But\nthere's no threshold for importance. It's a matter of degree, and\noften hard to judge at the time anyway.\n", type='text'), TextContentItem(text='END of knowledge_search tool results.\n', type='text')]
|
||||
|
||||
inference> Based on the search results, it seems that doing great work means doing something important so well that you expand people's ideas of what's possible. However, there is no clear threshold for importance, and it can be difficult to judge at the time.
|
||||
|
||||
To further clarify, I would suggest that doing great work involves:
|
||||
|
||||
* Completing tasks with high quality and attention to detail
|
||||
* Expanding on existing knowledge or ideas
|
||||
* Making a positive impact on others through your work
|
||||
* Striving for excellence and continuous improvement
|
||||
|
||||
Ultimately, great work is about making a meaningful contribution and leaving a lasting impression.
|
||||
```
|
||||
Congratulations! You've successfully built your first RAG application using Llama Stack! 🎉🥳
|
||||
|
||||
### Next Steps
|
||||
|
||||
Now you're ready to dive deeper into Llama Stack!
|
||||
- Explore the [Detailed Tutorial](./detailed_tutorial.md).
|
||||
- Try the [Getting Started Notebook](https://github.com/meta-llama/llama-stack/blob/main/docs/getting_started.ipynb).
|
||||
- Browse more [Notebooks on GitHub](https://github.com/meta-llama/llama-stack/tree/main/docs/notebooks).
|
||||
- Learn about Llama Stack [Concepts](../concepts/index.md).
|
||||
- Discover how to [Build Llama Stacks](../distributions/index.md).
|
||||
- Refer to our [References](../references/index.md) for details on the Llama CLI and Python SDK.
|
||||
- Check out the [llama-stack-apps](https://github.com/meta-llama/llama-stack-apps/tree/main/examples) repository for example applications and tutorials.
|
|
@ -40,17 +40,6 @@ Kotlin.
|
|||
- Ready to build? Check out the [Quick Start](getting_started/index) to get started.
|
||||
- Want to contribute? See the [Contributing](contributing/index) guide.
|
||||
|
||||
## Client SDKs
|
||||
|
||||
We have a number of client-side SDKs available for different languages.
|
||||
|
||||
| **Language** | **Client SDK** | **Package** |
|
||||
| :----: | :----: | :----: |
|
||||
| Python | [llama-stack-client-python](https://github.com/meta-llama/llama-stack-client-python) | [](https://pypi.org/project/llama_stack_client/)
|
||||
| Swift | [llama-stack-client-swift](https://github.com/meta-llama/llama-stack-client-swift/tree/latest-release) | [](https://swiftpackageindex.com/meta-llama/llama-stack-client-swift)
|
||||
| Node | [llama-stack-client-node](https://github.com/meta-llama/llama-stack-client-node) | [](https://npmjs.org/package/llama-stack-client)
|
||||
| Kotlin | [llama-stack-client-kotlin](https://github.com/meta-llama/llama-stack-client-kotlin/tree/latest-release) | [](https://central.sonatype.com/artifact/com.llama.llamastack/llama-stack-client-kotlin)
|
||||
|
||||
## Supported Llama Stack Implementations
|
||||
|
||||
A number of "adapters" are available for some popular Inference and Vector Store providers. For other APIs (particularly Safety and Agents), we provide *reference implementations* you can use to get started. We expect this list to grow over time. We are slowly onboarding more providers to the ecosystem as we get more confidence in the APIs.
|
||||
|
@ -133,14 +122,12 @@ A number of "adapters" are available for some popular Inference and Vector Store
|
|||
|
||||
self
|
||||
getting_started/index
|
||||
getting_started/detailed_tutorial
|
||||
introduction/index
|
||||
concepts/index
|
||||
openai/index
|
||||
providers/index
|
||||
distributions/index
|
||||
advanced_apis/index
|
||||
building_applications/index
|
||||
playground/index
|
||||
deploying/index
|
||||
contributing/index
|
||||
references/index
|
||||
```
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
# Providers Overview
|
||||
# API Providers Overview
|
||||
|
||||
The goal of Llama Stack is to build an ecosystem where users can easily swap out different implementations for the same API. Examples for these include:
|
||||
- LLM inference providers (e.g., Meta Reference, Ollama, Fireworks, Together, AWS Bedrock, Groq, Cerebras, SambaNova, vLLM, OpenAI, Anthropic, Gemini, WatsonX, etc.),
|
||||
|
@ -13,13 +13,25 @@ Providers come in two flavors:
|
|||
Importantly, Llama Stack always strives to provide at least one fully inline provider for each API so you can iterate on a fully featured environment locally.
|
||||
|
||||
## External Providers
|
||||
|
||||
Llama Stack supports external providers that live outside of the main codebase. This allows you to create and maintain your own providers independently.
|
||||
|
||||
```{toctree}
|
||||
:maxdepth: 1
|
||||
|
||||
external
|
||||
external.md
|
||||
```
|
||||
|
||||
```{include} openai.md
|
||||
:start-after: ## OpenAI API Compatibility
|
||||
```
|
||||
|
||||
## Inference
|
||||
Runs inference with an LLM.
|
||||
|
||||
```{toctree}
|
||||
:maxdepth: 1
|
||||
|
||||
inference/index
|
||||
```
|
||||
|
||||
## Agents
|
||||
|
@ -40,33 +52,6 @@ Interfaces with datasets and data loaders.
|
|||
datasetio/index
|
||||
```
|
||||
|
||||
## Eval
|
||||
Generates outputs (via Inference or Agents) and perform scoring.
|
||||
|
||||
```{toctree}
|
||||
:maxdepth: 1
|
||||
|
||||
eval/index
|
||||
```
|
||||
|
||||
## Inference
|
||||
Runs inference with an LLM.
|
||||
|
||||
```{toctree}
|
||||
:maxdepth: 1
|
||||
|
||||
inference/index
|
||||
```
|
||||
|
||||
## Post Training
|
||||
Fine-tunes a model.
|
||||
|
||||
```{toctree}
|
||||
:maxdepth: 1
|
||||
|
||||
post_training/index
|
||||
```
|
||||
|
||||
## Safety
|
||||
Applies safety policies to the output at a Systems (not only model) level.
|
||||
|
||||
|
@ -76,15 +61,6 @@ Applies safety policies to the output at a Systems (not only model) level.
|
|||
safety/index
|
||||
```
|
||||
|
||||
## Scoring
|
||||
Evaluates the outputs of the system.
|
||||
|
||||
```{toctree}
|
||||
:maxdepth: 1
|
||||
|
||||
scoring/index
|
||||
```
|
||||
|
||||
## Telemetry
|
||||
Collects telemetry data from the system.
|
||||
|
||||
|
@ -94,15 +70,6 @@ Collects telemetry data from the system.
|
|||
telemetry/index
|
||||
```
|
||||
|
||||
## Tool Runtime
|
||||
Is associated with the ToolGroup resouces.
|
||||
|
||||
```{toctree}
|
||||
:maxdepth: 1
|
||||
|
||||
tool_runtime/index
|
||||
```
|
||||
|
||||
## Vector IO
|
||||
|
||||
Vector IO refers to operations on vector databases, such as adding documents, searching, and deleting documents.
|
||||
|
@ -114,3 +81,12 @@ io and database are used to store and retrieve documents for retrieval.
|
|||
|
||||
vector_io/index
|
||||
```
|
||||
|
||||
## Tool Runtime
|
||||
Is associated with the ToolGroup resources.
|
||||
|
||||
```{toctree}
|
||||
:maxdepth: 1
|
||||
|
||||
tool_runtime/index
|
||||
```
|
|
@ -1,14 +1,14 @@
|
|||
# OpenAI API Compatibility
|
||||
## OpenAI API Compatibility
|
||||
|
||||
## Server path
|
||||
### Server path
|
||||
|
||||
Llama Stack exposes an OpenAI-compatible API endpoint at `/v1/openai/v1`. So, for a Llama Stack server running locally on port `8321`, the full url to the OpenAI-compatible API endpoint is `http://localhost:8321/v1/openai/v1`.
|
||||
|
||||
## Clients
|
||||
### Clients
|
||||
|
||||
You should be able to use any client that speaks OpenAI APIs with Llama Stack. We regularly test with the official Llama Stack clients as well as OpenAI's official Python client.
|
||||
|
||||
### Llama Stack Client
|
||||
#### Llama Stack Client
|
||||
|
||||
When using the Llama Stack client, set the `base_url` to the root of your Llama Stack server. It will automatically route OpenAI-compatible requests to the right server endpoint for you.
|
||||
|
||||
|
@ -18,7 +18,7 @@ from llama_stack_client import LlamaStackClient
|
|||
client = LlamaStackClient(base_url="http://localhost:8321")
|
||||
```
|
||||
|
||||
### OpenAI Client
|
||||
#### OpenAI Client
|
||||
|
||||
When using an OpenAI client, set the `base_url` to the `/v1/openai/v1` path on your Llama Stack server.
|
||||
|
||||
|
@ -30,9 +30,9 @@ client = OpenAI(base_url="http://localhost:8321/v1/openai/v1", api_key="none")
|
|||
|
||||
Regardless of the client you choose, the following code examples should all work the same.
|
||||
|
||||
## APIs implemented
|
||||
### APIs implemented
|
||||
|
||||
### Models
|
||||
#### Models
|
||||
|
||||
Many of the APIs require you to pass in a model parameter. To see the list of models available in your Llama Stack server:
|
||||
|
||||
|
@ -40,13 +40,13 @@ Many of the APIs require you to pass in a model parameter. To see the list of mo
|
|||
models = client.models.list()
|
||||
```
|
||||
|
||||
### Responses
|
||||
#### Responses
|
||||
|
||||
:::{note}
|
||||
The Responses API implementation is still in active development. While it is quite usable, there are still unimplemented parts of the API. We'd love feedback on any use-cases you try that do not work to help prioritize the pieces left to implement. Please open issues in the [meta-llama/llama-stack](https://github.com/meta-llama/llama-stack) GitHub repository with details of anything that does not work.
|
||||
:::
|
||||
|
||||
#### Simple inference
|
||||
##### Simple inference
|
||||
|
||||
Request:
|
||||
|
||||
|
@ -66,7 +66,7 @@ Syntax whispers secrets sweet
|
|||
Code's gentle silence
|
||||
```
|
||||
|
||||
#### Structured Output
|
||||
##### Structured Output
|
||||
|
||||
Request:
|
||||
|
||||
|
@ -106,9 +106,9 @@ Example output:
|
|||
{ "participants": ["Alice", "Bob"] }
|
||||
```
|
||||
|
||||
### Chat Completions
|
||||
#### Chat Completions
|
||||
|
||||
#### Simple inference
|
||||
##### Simple inference
|
||||
|
||||
Request:
|
||||
|
||||
|
@ -129,7 +129,7 @@ Logic flows like a river
|
|||
Code's gentle beauty
|
||||
```
|
||||
|
||||
#### Structured Output
|
||||
##### Structured Output
|
||||
|
||||
Request:
|
||||
|
||||
|
@ -170,9 +170,9 @@ Example output:
|
|||
{ "participants": ["Alice", "Bob"] }
|
||||
```
|
||||
|
||||
### Completions
|
||||
#### Completions
|
||||
|
||||
#### Simple inference
|
||||
##### Simple inference
|
||||
|
||||
Request:
|
||||
|
|
@ -125,6 +125,7 @@ docs = [
|
|||
"sphinxcontrib.redoc",
|
||||
"sphinxcontrib.video",
|
||||
"sphinxcontrib.mermaid",
|
||||
"sphinx-reredirects",
|
||||
"tomli",
|
||||
"linkify",
|
||||
"sphinxcontrib.openapi",
|
||||
|
|
14
uv.lock
generated
14
uv.lock
generated
|
@ -1340,6 +1340,7 @@ docs = [
|
|||
{ name = "sphinx-autobuild" },
|
||||
{ name = "sphinx-copybutton" },
|
||||
{ name = "sphinx-design" },
|
||||
{ name = "sphinx-reredirects" },
|
||||
{ name = "sphinx-rtd-dark-mode" },
|
||||
{ name = "sphinx-rtd-theme" },
|
||||
{ name = "sphinx-tabs" },
|
||||
|
@ -1449,6 +1450,7 @@ docs = [
|
|||
{ name = "sphinx-autobuild" },
|
||||
{ name = "sphinx-copybutton" },
|
||||
{ name = "sphinx-design" },
|
||||
{ name = "sphinx-reredirects" },
|
||||
{ name = "sphinx-rtd-dark-mode" },
|
||||
{ name = "sphinx-rtd-theme" },
|
||||
{ name = "sphinx-tabs" },
|
||||
|
@ -3129,6 +3131,18 @@ wheels = [
|
|||
{ url = "https://files.pythonhosted.org/packages/42/3d/6b41fe1637cd53c4b10d56e0e6f396546f837973dabf9c4b2a1de44620ac/sphinx_mdinclude-0.6.2-py3-none-any.whl", hash = "sha256:648e78edb067c0e4bffc22943278d49d54a0714494743592032fa3ad82a86984", size = 16911, upload-time = "2024-08-03T19:07:30.406Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "sphinx-reredirects"
|
||||
version = "1.0.0"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
dependencies = [
|
||||
{ name = "sphinx" },
|
||||
]
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/72/61/d3039bc2b688c73e81f515afe771b7cc9631dfef63b3e3ac3aab3d73c685/sphinx_reredirects-1.0.0.tar.gz", hash = "sha256:7c9bada9f1330489fcf4c7297a2d6da2a49ca4877d3f42d1388ae1de1019bf5c", size = 711970, upload-time = "2025-05-31T14:45:55.428Z" }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/2a/7f/adb886a3db417a2ccea6a13dcb4b88d08f82104aed17e347346f79480a5f/sphinx_reredirects-1.0.0-py3-none-any.whl", hash = "sha256:1d0102710a8f633c6c885f940f440f7195ada675c1739976f0135790747dea06", size = 6173, upload-time = "2025-05-31T14:45:53.014Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "sphinx-rtd-dark-mode"
|
||||
version = "1.3.0"
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue