provider codegen fixes

This commit is contained in:
Alexey Rybak 2025-09-22 15:04:46 -07:00
parent 29d84570c3
commit 04bf9e6f80
80 changed files with 1875 additions and 433 deletions

View file

@@ -1,7 +1,13 @@
 ---
-description: Available providers for the agents API
-sidebar_label: Overview
-sidebar_position: 1
+description: "Agents API for creating and interacting with agentic systems.
+
+  Main functionalities provided by this API:
+  - Create agents with specific instructions and ability to use tools.
+  - Interactions with agents are grouped into sessions (\"threads\"), and each interaction is called a \"turn\".
+  - Agents can be provided with various tools (see the ToolGroups and ToolRuntime APIs for more details).
+  - Agents can be provided with various shields (see the Safety API for more details).
+  - Agents can also use Memory to retrieve information from knowledge bases. See the RAG Tool and Vector IO APIs for more details."
+sidebar_label: Agents
 title: Agents
 ---
@@ -22,4 +28,4 @@ This section contains documentation for all available providers for the **agents** API.
 ## Providers
-- **[Meta Reference](./inline_meta-reference)** - Inline provider
+- [Meta-Reference](./inline_meta-reference)
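As a hedged illustration of the session/turn flow the new Agents description mentions, a client call might look like the sketch below. It assumes the `llama_stack_client` Python SDK, a stack on `localhost:8321`, and a placeholder model id; the exact `Agent` constructor signature varies by SDK version and is not part of this diff.

```python
# Hedged sketch of the session/turn flow described in the Agents API docs.
# Assumes the llama_stack_client SDK and a stack at localhost:8321; the
# model id and constructor signature are assumptions and may differ by version.
from llama_stack_client import LlamaStackClient
from llama_stack_client.lib.agents.agent import Agent

client = LlamaStackClient(base_url="http://localhost:8321")

agent = Agent(
    client,
    model="meta-llama/Llama-3.1-8B-Instruct",  # placeholder model id
    instructions="You are a helpful assistant.",
)

# Interactions are grouped into sessions ("threads") ...
session_id = agent.create_session("demo-session")

# ... and each interaction is a "turn".
turn = agent.create_turn(
    session_id=session_id,
    messages=[{"role": "user", "content": "Summarize the Agents API."}],
    stream=False,
)
print(turn.output_message.content)
```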

View file

@@ -1,8 +1,6 @@
 ---
-description: Meta's reference implementation of an agent system that can use tools,
-  access vector databases, and perform complex reasoning tasks
-sidebar_label: Meta Reference
-sidebar_position: 2
+description: "Meta's reference implementation of an agent system that can use tools, access vector databases, and perform complex reasoning tasks."
+sidebar_label: Meta-Reference
 title: inline::meta-reference
 ---

View file

@@ -1,7 +1,15 @@
 ---
-description: Available providers for the batches API
-sidebar_label: Overview
-sidebar_position: 1
+description: "The Batches API enables efficient processing of multiple requests in a single operation,
+  particularly useful for processing large datasets, batch evaluation workflows, and
+  cost-effective inference at scale.
+
+  The API is designed to allow use of openai client libraries for seamless integration.
+
+  This API provides the following extensions:
+  - idempotent batch creation
+
+  Note: This API is currently under active development and may undergo changes."
+sidebar_label: Batches
 title: Batches
 ---
@@ -24,4 +32,4 @@ This section contains documentation for all available providers for the **batches** API.
 ## Providers
-- **[Reference](./inline_reference)** - Inline provider
+- [Reference](./inline_reference)
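Since the new Batches description stresses compatibility with OpenAI client libraries, here is a hedged sketch of batch creation through the standard `openai` client. The base URL, API key, and file id are placeholders and assumptions, not something this diff documents.

```python
# Hedged sketch: using the openai client against a Llama Stack server.
# The base URL and input file id below are placeholders/assumptions.
from openai import OpenAI

client = OpenAI(
    base_url="http://localhost:8321/v1/openai/v1",  # assumed OpenAI-compatible endpoint
    api_key="not-needed-locally",
)

batch = client.batches.create(
    input_file_id="file-abc123",      # hypothetical uploaded JSONL of requests
    endpoint="/v1/chat/completions",
    completion_window="24h",
)
print(batch.id, batch.status)
```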

View file

@@ -1,7 +1,6 @@
 ---
-description: Reference implementation of batches API with KVStore persistence
+description: "Reference implementation of batches API with KVStore persistence."
 sidebar_label: Reference
-sidebar_position: 2
 title: inline::reference
 ---

View file

@@ -1,7 +1,5 @@
 ---
-description: Available providers for the datasetio API
-sidebar_label: Overview
-sidebar_position: 1
+sidebar_label: Datasetio
 title: Datasetio
 ---
@@ -13,6 +11,6 @@ This section contains documentation for all available providers for the **datasetio** API.
 ## Providers
-- **[Localfs](./inline_localfs)** - Inline provider
-- **[Huggingface](./remote_huggingface)** - Remote provider
-- **[Nvidia](./remote_nvidia)** - Remote provider
+- [Localfs](./inline_localfs)
+- [Remote - Huggingface](./remote_huggingface)
+- [Remote - Nvidia](./remote_nvidia)

View file

@@ -1,8 +1,6 @@
 ---
-description: Local filesystem-based dataset I/O provider for reading and writing datasets
-  to local storage
+description: "Local filesystem-based dataset I/O provider for reading and writing datasets to local storage."
 sidebar_label: Localfs
-sidebar_position: 2
 title: inline::localfs
 ---

View file

@@ -1,8 +1,6 @@
 ---
-description: HuggingFace datasets provider for accessing and managing datasets from
-  the HuggingFace Hub
-sidebar_label: Huggingface
-sidebar_position: 3
+description: "HuggingFace datasets provider for accessing and managing datasets from the HuggingFace Hub."
+sidebar_label: Remote - Huggingface
 title: remote::huggingface
 ---

View file

@@ -1,8 +1,6 @@
 ---
-description: NVIDIA's dataset I/O provider for accessing datasets from NVIDIA's data
-  platform
-sidebar_label: Nvidia
-sidebar_position: 4
+description: "NVIDIA's dataset I/O provider for accessing datasets from NVIDIA's data platform."
+sidebar_label: Remote - Nvidia
 title: remote::nvidia
 ---

View file

@@ -1,7 +1,6 @@
 ---
-description: Available providers for the eval API
-sidebar_label: Overview
-sidebar_position: 1
+description: "Llama Stack Evaluation API for running evaluations on model and agent candidates."
+sidebar_label: Eval
 title: Eval
 ---
@@ -15,5 +14,5 @@ This section contains documentation for all available providers for the **eval** API.
 ## Providers
-- **[Meta Reference](./inline_meta-reference)** - Inline provider
-- **[Nvidia](./remote_nvidia)** - Remote provider
+- [Meta-Reference](./inline_meta-reference)
+- [Remote - Nvidia](./remote_nvidia)

View file

@@ -1,8 +1,6 @@
 ---
-description: Meta's reference implementation of evaluation tasks with support for
-  multiple languages and evaluation metrics
-sidebar_label: Meta Reference
-sidebar_position: 2
+description: "Meta's reference implementation of evaluation tasks with support for multiple languages and evaluation metrics."
+sidebar_label: Meta-Reference
 title: inline::meta-reference
 ---

View file

@@ -1,8 +1,6 @@
 ---
-description: NVIDIA's evaluation provider for running evaluation tasks on NVIDIA's
-  platform
-sidebar_label: Nvidia
-sidebar_position: 3
+description: "NVIDIA's evaluation provider for running evaluation tasks on NVIDIA's platform."
+sidebar_label: Remote - Nvidia
 title: remote::nvidia
 ---

View file

@@ -1,7 +1,5 @@
 ---
-description: Available providers for the files API
-sidebar_label: Overview
-sidebar_position: 1
+sidebar_label: Files
 title: Files
 ---
@@ -13,5 +11,5 @@ This section contains documentation for all available providers for the **files** API.
 ## Providers
-- **[Localfs](./inline_localfs)** - Inline provider
-- **[S3](./remote_s3)** - Remote provider
+- [Localfs](./inline_localfs)
+- [Remote - S3](./remote_s3)

View file

@@ -1,8 +1,6 @@
 ---
-description: Local filesystem-based file storage provider for managing files and documents
-  locally
+description: "Local filesystem-based file storage provider for managing files and documents locally."
 sidebar_label: Localfs
-sidebar_position: 2
 title: inline::localfs
 ---

View file

@@ -1,8 +1,6 @@
 ---
-description: AWS S3-based file storage provider for scalable cloud file management
-  with metadata persistence
-sidebar_label: S3
-sidebar_position: 3
+description: "AWS S3-based file storage provider for scalable cloud file management with metadata persistence."
+sidebar_label: Remote - S3
 title: remote::s3
 ---

View file

@@ -1,7 +1,10 @@
 ---
-description: Available providers for the inference API
-sidebar_label: Overview
-sidebar_position: 1
+description: "Llama Stack Inference API for generating completions, chat completions, and embeddings.
+
+  This API provides the raw interface to the underlying models. Two kinds of models are supported:
+  - LLM models: these models generate \"raw\" and \"chat\" (conversational) completions.
+  - Embedding models: these models generate embeddings to be used for semantic search."
+sidebar_label: Inference
 title: Inference
 ---
@@ -19,28 +22,27 @@ This section contains documentation for all available providers for the **inference** API.
 ## Providers
-- **[Meta Reference](./inline_meta-reference)** - Inline provider
-- **[Sentence Transformers](./inline_sentence-transformers)** - Inline provider
-- **[Anthropic](./remote_anthropic)** - Remote provider
-- **[Azure](./remote_azure)** - Remote provider
-- **[Bedrock](./remote_bedrock)** - Remote provider
-- **[Cerebras](./remote_cerebras)** - Remote provider
-- **[Databricks](./remote_databricks)** - Remote provider
-- **[Fireworks](./remote_fireworks)** - Remote provider
-- **[Gemini](./remote_gemini)** - Remote provider
-- **[Groq](./remote_groq)** - Remote provider
-- **[Hugging Face Endpoint](./remote_hf_endpoint)** - Remote provider
-- **[Hugging Face Serverless](./remote_hf_serverless)** - Remote provider
-- **[Llama OpenAI Compatible](./remote_llama-openai-compat)** - Remote provider
-- **[Nvidia](./remote_nvidia)** - Remote provider
-- **[Ollama](./remote_ollama)** - Remote provider
-- **[Openai](./remote_openai)** - Remote provider
-- **[Passthrough](./remote_passthrough)** - Remote provider
-- **[Runpod](./remote_runpod)** - Remote provider
-- **[Sambanova](./remote_sambanova)** - Remote provider
-- **[SambaNova OpenAI Compatible](./remote_sambanova-openai-compat)** - Remote provider
-- **[Tgi](./remote_tgi)** - Remote provider
-- **[Together](./remote_together)** - Remote provider
-- **[Vertexai](./remote_vertexai)** - Remote provider
-- **[Vllm](./remote_vllm)** - Remote provider
-- **[Watsonx](./remote_watsonx)** - Remote provider
+- [Meta-Reference](./inline_meta-reference)
+- [Sentence-Transformers](./inline_sentence-transformers)
+- [Remote - Anthropic](./remote_anthropic)
+- [Remote - Azure](./remote_azure)
+- [Remote - Bedrock](./remote_bedrock)
+- [Remote - Cerebras](./remote_cerebras)
+- [Remote - Databricks](./remote_databricks)
+- [Remote - Fireworks](./remote_fireworks)
+- [Remote - Gemini](./remote_gemini)
+- [Remote - Groq](./remote_groq)
+- [Remote - Hf - Endpoint](./remote_hf_endpoint)
+- [Remote - Hf - Serverless](./remote_hf_serverless)
+- [Remote - Llama-Openai-Compat](./remote_llama-openai-compat)
+- [Remote - Nvidia](./remote_nvidia)
+- [Remote - Ollama](./remote_ollama)
+- [Remote - Openai](./remote_openai)
+- [Remote - Passthrough](./remote_passthrough)
+- [Remote - Runpod](./remote_runpod)
+- [Remote - Sambanova](./remote_sambanova)
+- [Remote - Tgi](./remote_tgi)
+- [Remote - Together](./remote_together)
+- [Remote - Vertexai](./remote_vertexai)
+- [Remote - Vllm](./remote_vllm)
+- [Remote - Watsonx](./remote_watsonx)
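The Inference description distinguishes LLM models from embedding models; the hedged sketch below shows both paths through an OpenAI-compatible client. The base URL and model ids are assumptions for illustration only.

```python
# Hedged sketch of chat completion and embeddings against a Llama Stack server.
# Base URL and model ids are assumptions, not part of this diff.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8321/v1/openai/v1", api_key="none")

# LLM models: "chat" (conversational) completions
chat = client.chat.completions.create(
    model="meta-llama/Llama-3.1-8B-Instruct",
    messages=[{"role": "user", "content": "Hello!"}],
)
print(chat.choices[0].message.content)

# Embedding models: vectors to be used for semantic search
emb = client.embeddings.create(
    model="sentence-transformers/all-MiniLM-L6-v2",
    input=["the quick brown fox"],
)
print(len(emb.data[0].embedding))
```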

View file

@@ -1,8 +1,6 @@
 ---
-description: Meta's reference implementation of inference with support for various
-  model formats and optimization techniques
-sidebar_label: Meta Reference
-sidebar_position: 2
+description: "Meta's reference implementation of inference with support for various model formats and optimization techniques."
+sidebar_label: Meta-Reference
 title: inline::meta-reference
 ---

View file

@@ -1,8 +1,6 @@
 ---
-description: Sentence Transformers inference provider for text embeddings and similarity
-  search
-sidebar_label: Sentence Transformers
-sidebar_position: 3
+description: "Sentence Transformers inference provider for text embeddings and similarity search."
+sidebar_label: Sentence-Transformers
 title: inline::sentence-transformers
 ---
@@ -12,10 +10,6 @@ title: inline::sentence-transformers
 Sentence Transformers inference provider for text embeddings and similarity search.
-## Configuration
-No configuration options available.
 ## Sample Configuration
 ```yaml

View file

@@ -1,8 +1,6 @@
 ---
-description: Anthropic inference provider for accessing Claude models and Anthropic's
-  AI services
-sidebar_label: Anthropic
-sidebar_position: 4
+description: "Anthropic inference provider for accessing Claude models and Anthropic's AI services."
+sidebar_label: Remote - Anthropic
 title: remote::anthropic
 ---

View file

@@ -1,8 +1,9 @@
 ---
-description: Azure OpenAI inference provider for accessing GPT models and other Azure
-  services
-sidebar_label: Azure
-sidebar_position: 5
+description: |
+  Azure OpenAI inference provider for accessing GPT models and other Azure services.
+
+  Provider documentation
+  https://learn.microsoft.com/en-us/azure/ai-foundry/openai/overview
+sidebar_label: Remote - Azure
 title: remote::azure
 ---
@@ -10,10 +11,12 @@ title: remote::azure
 ## Description
 Azure OpenAI inference provider for accessing GPT models and other Azure services.
 Provider documentation
 https://learn.microsoft.com/en-us/azure/ai-foundry/openai/overview
 ## Configuration
 | Field | Type | Required | Default | Description |

View file

@@ -1,8 +1,6 @@
 ---
-description: AWS Bedrock inference provider for accessing various AI models through
-  AWS's managed service
-sidebar_label: Bedrock
-sidebar_position: 6
+description: "AWS Bedrock inference provider for accessing various AI models through AWS's managed service."
+sidebar_label: Remote - Bedrock
 title: remote::bedrock
 ---

View file

@@ -1,7 +1,6 @@
 ---
-description: Cerebras inference provider for running models on Cerebras Cloud platform
-sidebar_label: Cerebras
-sidebar_position: 7
+description: "Cerebras inference provider for running models on Cerebras Cloud platform."
+sidebar_label: Remote - Cerebras
 title: remote::cerebras
 ---

View file

@@ -1,8 +1,6 @@
 ---
-description: Databricks inference provider for running models on Databricks' unified
-  analytics platform
-sidebar_label: Databricks
-sidebar_position: 8
+description: "Databricks inference provider for running models on Databricks' unified analytics platform."
+sidebar_label: Remote - Databricks
 title: remote::databricks
 ---

View file

@@ -1,8 +1,6 @@
 ---
-description: Fireworks AI inference provider for Llama models and other AI models
-  on the Fireworks platform
-sidebar_label: Fireworks
-sidebar_position: 9
+description: "Fireworks AI inference provider for Llama models and other AI models on the Fireworks platform."
+sidebar_label: Remote - Fireworks
 title: remote::fireworks
 ---

View file

@@ -1,8 +1,6 @@
 ---
-description: Google Gemini inference provider for accessing Gemini models and Google's
-  AI services
-sidebar_label: Gemini
-sidebar_position: 10
+description: "Google Gemini inference provider for accessing Gemini models and Google's AI services."
+sidebar_label: Remote - Gemini
 title: remote::gemini
 ---

View file

@@ -1,7 +1,6 @@
 ---
-description: Groq inference provider for ultra-fast inference using Groq's LPU technology
-sidebar_label: Groq
-sidebar_position: 11
+description: "Groq inference provider for ultra-fast inference using Groq's LPU technology."
+sidebar_label: Remote - Groq
 title: remote::groq
 ---

View file

@@ -1,7 +1,6 @@
 ---
-description: HuggingFace Inference Endpoints provider for dedicated model serving
-sidebar_label: Hugging Face Endpoint
-sidebar_position: 12
+description: "HuggingFace Inference Endpoints provider for dedicated model serving."
+sidebar_label: Remote - Hf - Endpoint
 title: remote::hf::endpoint
 ---
@@ -15,8 +14,8 @@ HuggingFace Inference Endpoints provider for dedicated model serving.
 | Field | Type | Required | Default | Description |
 |-------|------|----------|---------|-------------|
-| `endpoint_name` | `<class 'str'>` | No | | The name of the Hugging Face Inference Endpoint in the format of `{namespace}/{endpoint_name}` (e.g. 'my-cool-org/meta-llama-3-1-8b-instruct-rce'). Namespace is optional and will default to the user account if not provided. |
-| `api_token` | `pydantic.types.SecretStr or None` | No | | Your Hugging Face user access token (will default to locally saved token if not provided) |
+| `endpoint_name` | `<class 'str'>` | No | | The name of the Hugging Face Inference Endpoint in the format of '{namespace}/{endpoint_name}' (e.g. 'my-cool-org/meta-llama-3-1-8b-instruct-rce'). Namespace is optional and will default to the user account if not provided. |
+| `api_token` | `pydantic.types.SecretStr \| None` | No | | Your Hugging Face user access token (will default to locally saved token if not provided) |
 ## Sample Configuration

View file

@@ -1,7 +1,6 @@
 ---
-description: HuggingFace Inference API serverless provider for on-demand model inference
-sidebar_label: Hugging Face Serverless
-sidebar_position: 13
+description: "HuggingFace Inference API serverless provider for on-demand model inference."
+sidebar_label: Remote - Hf - Serverless
 title: remote::hf::serverless
 ---

View file

@@ -1,8 +1,6 @@
 ---
-description: Llama OpenAI-compatible provider for using Llama models with OpenAI API
-  format
-sidebar_label: Llama OpenAI Compatible
-sidebar_position: 14
+description: "Llama OpenAI-compatible provider for using Llama models with OpenAI API format."
+sidebar_label: Remote - Llama-Openai-Compat
 title: remote::llama-openai-compat
 ---

View file

@@ -1,7 +1,6 @@
 ---
-description: NVIDIA inference provider for accessing NVIDIA NIM models and AI services
-sidebar_label: Nvidia
-sidebar_position: 15
+description: "NVIDIA inference provider for accessing NVIDIA NIM models and AI services."
+sidebar_label: Remote - Nvidia
 title: remote::nvidia
 ---

View file

@@ -1,8 +1,6 @@
 ---
-description: Ollama inference provider for running local models through the Ollama
-  runtime
-sidebar_label: Ollama
-sidebar_position: 16
+description: "Ollama inference provider for running local models through the Ollama runtime."
+sidebar_label: Remote - Ollama
 title: remote::ollama
 ---

View file

@@ -1,7 +1,6 @@
 ---
-description: OpenAI inference provider for accessing GPT models and other OpenAI services
-sidebar_label: Openai
-sidebar_position: 17
+description: "OpenAI inference provider for accessing GPT models and other OpenAI services."
+sidebar_label: Remote - Openai
 title: remote::openai
 ---

View file

@@ -1,8 +1,6 @@
 ---
-description: Passthrough inference provider for connecting to any external inference
-  service not directly supported
-sidebar_label: Passthrough
-sidebar_position: 18
+description: "Passthrough inference provider for connecting to any external inference service not directly supported."
+sidebar_label: Remote - Passthrough
 title: remote::passthrough
 ---

View file

@@ -1,7 +1,6 @@
 ---
-description: RunPod inference provider for running models on RunPod's cloud GPU platform
-sidebar_label: Runpod
-sidebar_position: 19
+description: "RunPod inference provider for running models on RunPod's cloud GPU platform."
+sidebar_label: Remote - Runpod
 title: remote::runpod
 ---

View file

@@ -1,8 +1,6 @@
 ---
-description: SambaNova inference provider for running models on SambaNova's dataflow
-  architecture
-sidebar_label: Sambanova
-sidebar_position: 20
+description: "SambaNova inference provider for running models on SambaNova's dataflow architecture."
+sidebar_label: Remote - Sambanova
 title: remote::sambanova
 ---

View file

@@ -1,7 +1,6 @@
 ---
-description: Text Generation Inference (TGI) provider for HuggingFace model serving
-sidebar_label: Tgi
-sidebar_position: 22
+description: "Text Generation Inference (TGI) provider for HuggingFace model serving."
+sidebar_label: Remote - Tgi
 title: remote::tgi
 ---

View file

@@ -1,8 +1,6 @@
 ---
-description: Together AI inference provider for open-source models and collaborative
-  AI development
-sidebar_label: Together
-sidebar_position: 23
+description: "Together AI inference provider for open-source models and collaborative AI development."
+sidebar_label: Remote - Together
 title: remote::together
 ---

View file

@@ -1,18 +1,26 @@
 ---
-description: "Google Vertex AI inference provider enables you to use Google's Gemini\
-  \ models through Google Cloud's Vertex AI platform, providing several advantages:\n\
-  \n\u2022 Enterprise-grade security: Uses Google Cloud's security controls and IAM\n\
-  \u2022 Better integration: Seamless integration with other Google Cloud services\n\
-  \u2022 Advanced features: Access to additional Vertex AI features like model tuning\
-  \ and monitoring\n\u2022 Authentication: Uses Google Cloud Application Default Credentials\
-  \ (ADC) instead of API keys\n\nConfiguration:\n- Set VERTEX_AI_PROJECT environment\
-  \ variable (required)\n- Set VERTEX_AI_LOCATION environment variable (optional,\
-  \ defaults to us-central1)\n- Use Google Cloud Application Default Credentials or\
-  \ service account key\n\nAuthentication Setup:\nOption 1 (Recommended): gcloud auth\
-  \ application-default login\nOption 2: Set GOOGLE_APPLICATION_CREDENTIALS to service\
-  \ account key path\n\nAvailable Models:\n- vertex_ai/gemini-2"
-sidebar_label: Vertexai
-sidebar_position: 24
+description: |
+  Google Vertex AI inference provider enables you to use Google's Gemini models through Google Cloud's Vertex AI platform, providing several advantages:
+
+  • Enterprise-grade security: Uses Google Cloud's security controls and IAM
+  • Better integration: Seamless integration with other Google Cloud services
+  • Advanced features: Access to additional Vertex AI features like model tuning and monitoring
+  • Authentication: Uses Google Cloud Application Default Credentials (ADC) instead of API keys
+
+  Configuration:
+  - Set VERTEX_AI_PROJECT environment variable (required)
+  - Set VERTEX_AI_LOCATION environment variable (optional, defaults to us-central1)
+  - Use Google Cloud Application Default Credentials or service account key
+
+  Authentication Setup:
+  Option 1 (Recommended): gcloud auth application-default login
+  Option 2: Set GOOGLE_APPLICATION_CREDENTIALS to service account key path
+
+  Available Models:
+  - vertex_ai/gemini-2.0-flash
+  - vertex_ai/gemini-2.5-flash
+  - vertex_ai/gemini-2.5-pro
+sidebar_label: Remote - Vertexai
 title: remote::vertexai
 ---

View file

@@ -1,7 +1,6 @@
 ---
-description: Remote vLLM inference provider for connecting to vLLM servers
-sidebar_label: Vllm
-sidebar_position: 25
+description: "Remote vLLM inference provider for connecting to vLLM servers."
+sidebar_label: Remote - Vllm
 title: remote::vllm
 ---

View file

@@ -1,8 +1,6 @@
 ---
-description: IBM WatsonX inference provider for accessing AI models on IBM's WatsonX
-  platform
-sidebar_label: Watsonx
-sidebar_position: 26
+description: "IBM WatsonX inference provider for accessing AI models on IBM's WatsonX platform."
+sidebar_label: Remote - Watsonx
 title: remote::watsonx
 ---
@@ -17,8 +15,8 @@ IBM WatsonX inference provider for accessing AI models on IBM's WatsonX platform.
 | Field | Type | Required | Default | Description |
 |-------|------|----------|---------|-------------|
 | `url` | `<class 'str'>` | No | https://us-south.ml.cloud.ibm.com | A base url for accessing the watsonx.ai |
-| `api_key` | `pydantic.types.SecretStr \| None` | No | | The watsonx API key, only needed of using the hosted service |
-| `project_id` | `str \| None` | No | | The Project ID key, only needed of using the hosted service |
+| `api_key` | `pydantic.types.SecretStr \| None` | No | | The watsonx API key |
+| `project_id` | `str \| None` | No | | The Project ID key |
 | `timeout` | `<class 'int'>` | No | 60 | Timeout for the HTTP requests |
 ## Sample Configuration

View file

@@ -1,7 +1,5 @@
 ---
-description: Available providers for the post_training API
-sidebar_label: Overview
-sidebar_position: 1
+sidebar_label: Post Training
 title: Post_Training
 ---
@@ -13,10 +11,7 @@ This section contains documentation for all available providers for the **post_training** API.
 ## Providers
-- **[Huggingface](./inline_huggingface)** - Inline provider
-- **[Huggingface Cpu](./inline_huggingface-cpu)** - Inline provider
-- **[Huggingface Gpu](./inline_huggingface-gpu)** - Inline provider
-- **[Torchtune](./inline_torchtune)** - Inline provider
-- **[Torchtune Cpu](./inline_torchtune-cpu)** - Inline provider
-- **[Torchtune Gpu](./inline_torchtune-gpu)** - Inline provider
-- **[Nvidia](./remote_nvidia)** - Remote provider
+- [Huggingface-Gpu](./inline_huggingface-gpu)
+- [Torchtune-Cpu](./inline_torchtune-cpu)
+- [Torchtune-Gpu](./inline_torchtune-gpu)
+- [Remote - Nvidia](./remote_nvidia)

View file

@@ -1,8 +1,6 @@
 ---
-description: HuggingFace-based post-training provider for fine-tuning models using
-  the HuggingFace ecosystem
-sidebar_label: Huggingface Gpu
-sidebar_position: 4
+description: "HuggingFace-based post-training provider for fine-tuning models using the HuggingFace ecosystem."
+sidebar_label: Huggingface-Gpu
 title: inline::huggingface-gpu
 ---
@@ -17,10 +15,13 @@ HuggingFace-based post-training provider for fine-tuning models using the HuggingFace ecosystem.
 | Field | Type | Required | Default | Description |
 |-------|------|----------|---------|-------------|
 | `device` | `<class 'str'>` | No | cuda | |
-| `distributed_backend` | `Literal['fsdp', 'deepspeed']` | No | | |
-| `checkpoint_format` | `Literal['full_state', 'huggingface']` | No | huggingface | |
-| `chat_template` | `<class 'str'>` | No | `<\|user\|>{input}<\|assistant\|>{output}` | |
-| `model_specific_config` | `<class 'dict'>` | No | `{'trust_remote_code': True, 'attn_implementation': 'sdpa'}` | |
+| `distributed_backend` | `Literal['fsdp', 'deepspeed']` | No | | |
+| `checkpoint_format` | `Literal['full_state', 'huggingface']` | No | huggingface | |
+| `chat_template` | `<class 'str'>` | No | <\|user\|>{input}<\|assistant\|>{output} | |
+| `model_specific_config` | `<class 'dict'>` | No | {'trust_remote_code': True, 'attn_implementation': 'sdpa'} | |
 | `max_seq_length` | `<class 'int'>` | No | 2048 | |
 | `gradient_checkpointing` | `<class 'bool'>` | No | False | |
 | `save_total_limit` | `<class 'int'>` | No | 3 | |

View file

@@ -1,8 +1,6 @@
 ---
-description: TorchTune-based post-training provider for fine-tuning and optimizing
-  models using Meta's TorchTune framework
-sidebar_label: Torchtune Cpu
-sidebar_position: 6
+description: "TorchTune-based post-training provider for fine-tuning and optimizing models using Meta's TorchTune framework."
+sidebar_label: Torchtune-Cpu
 title: inline::torchtune-cpu
 ---

View file

@@ -1,8 +1,6 @@
 ---
-description: TorchTune-based post-training provider for fine-tuning and optimizing
-  models using Meta's TorchTune framework
-sidebar_label: Torchtune Gpu
-sidebar_position: 7
+description: "TorchTune-based post-training provider for fine-tuning and optimizing models using Meta's TorchTune framework."
+sidebar_label: Torchtune-Gpu
 title: inline::torchtune-gpu
 ---

View file

@@ -1,7 +1,6 @@
 ---
-description: NVIDIA's post-training provider for fine-tuning models on NVIDIA's platform
-sidebar_label: Nvidia
-sidebar_position: 8
+description: "NVIDIA's post-training provider for fine-tuning models on NVIDIA's platform."
+sidebar_label: Remote - Nvidia
 title: remote::nvidia
 ---

View file

@@ -1,7 +1,5 @@
 ---
-description: Available providers for the safety API
-sidebar_label: Overview
-sidebar_position: 1
+sidebar_label: Safety
 title: Safety
 ---
@@ -13,9 +11,9 @@ This section contains documentation for all available providers for the **safety** API.
 ## Providers
-- **[Code Scanner](./inline_code-scanner)** - Inline provider
-- **[Llama Guard](./inline_llama-guard)** - Inline provider
-- **[Prompt Guard](./inline_prompt-guard)** - Inline provider
-- **[Bedrock](./remote_bedrock)** - Remote provider
-- **[Nvidia](./remote_nvidia)** - Remote provider
-- **[Sambanova](./remote_sambanova)** - Remote provider
+- [Code-Scanner](./inline_code-scanner)
+- [Llama-Guard](./inline_llama-guard)
+- [Prompt-Guard](./inline_prompt-guard)
+- [Remote - Bedrock](./remote_bedrock)
+- [Remote - Nvidia](./remote_nvidia)
+- [Remote - Sambanova](./remote_sambanova)

View file

@@ -1,8 +1,6 @@
 ---
-description: Code Scanner safety provider for detecting security vulnerabilities and
-  unsafe code patterns
-sidebar_label: Code Scanner
-sidebar_position: 2
+description: "Code Scanner safety provider for detecting security vulnerabilities and unsafe code patterns."
+sidebar_label: Code-Scanner
 title: inline::code-scanner
 ---
@@ -12,10 +10,6 @@ title: inline::code-scanner
 Code Scanner safety provider for detecting security vulnerabilities and unsafe code patterns.
-## Configuration
-No configuration options available.
 ## Sample Configuration
 ```yaml

View file

@@ -1,8 +1,6 @@
 ---
-description: Llama Guard safety provider for content moderation and safety filtering
-  using Meta's Llama Guard model
-sidebar_label: Llama Guard
-sidebar_position: 3
+description: "Llama Guard safety provider for content moderation and safety filtering using Meta's Llama Guard model."
+sidebar_label: Llama-Guard
 title: inline::llama-guard
 ---

View file

@@ -1,8 +1,6 @@
 ---
-description: Prompt Guard safety provider for detecting and filtering unsafe prompts
-  and content
-sidebar_label: Prompt Guard
-sidebar_position: 4
+description: "Prompt Guard safety provider for detecting and filtering unsafe prompts and content."
+sidebar_label: Prompt-Guard
 title: inline::prompt-guard
 ---

View file

@@ -1,8 +1,6 @@
 ---
-description: AWS Bedrock safety provider for content moderation using AWS's safety
-  services
-sidebar_label: Bedrock
-sidebar_position: 5
+description: "AWS Bedrock safety provider for content moderation using AWS's safety services."
+sidebar_label: Remote - Bedrock
 title: remote::bedrock
 ---

View file

@@ -1,7 +1,6 @@
 ---
-description: NVIDIA's safety provider for content moderation and safety filtering
-sidebar_label: Nvidia
-sidebar_position: 6
+description: "NVIDIA's safety provider for content moderation and safety filtering."
+sidebar_label: Remote - Nvidia
 title: remote::nvidia
 ---

View file

@@ -1,7 +1,6 @@
 ---
-description: SambaNova's safety provider for content moderation and safety filtering
-sidebar_label: Sambanova
-sidebar_position: 7
+description: "SambaNova's safety provider for content moderation and safety filtering."
+sidebar_label: Remote - Sambanova
 title: remote::sambanova
 ---

View file

@@ -1,7 +1,5 @@
 ---
-description: Available providers for the scoring API
-sidebar_label: Overview
-sidebar_position: 1
+sidebar_label: Scoring
 title: Scoring
 ---
@@ -13,6 +11,6 @@ This section contains documentation for all available providers for the **scoring** API.
 ## Providers
-- **[Basic](./inline_basic)** - Inline provider
-- **[Braintrust](./inline_braintrust)** - Inline provider
-- **[Llm As Judge](./inline_llm-as-judge)** - Inline provider
+- [Basic](./inline_basic)
+- [Braintrust](./inline_braintrust)
+- [Llm-As-Judge](./inline_llm-as-judge)

View file

@@ -1,7 +1,6 @@
 ---
-description: Basic scoring provider for simple evaluation metrics and scoring functions
+description: "Basic scoring provider for simple evaluation metrics and scoring functions."
 sidebar_label: Basic
-sidebar_position: 2
 title: inline::basic
 ---
@@ -11,10 +10,6 @@ title: inline::basic
 Basic scoring provider for simple evaluation metrics and scoring functions.
-## Configuration
-No configuration options available.
 ## Sample Configuration
 ```yaml

View file

@@ -1,8 +1,6 @@
 ---
-description: Braintrust scoring provider for evaluation and scoring using the Braintrust
-  platform
+description: "Braintrust scoring provider for evaluation and scoring using the Braintrust platform."
 sidebar_label: Braintrust
-sidebar_position: 3
 title: inline::braintrust
 ---

View file

@@ -1,8 +1,6 @@
 ---
-description: LLM-as-judge scoring provider that uses language models to evaluate and
-  score responses
-sidebar_label: Llm As Judge
-sidebar_position: 4
+description: "LLM-as-judge scoring provider that uses language models to evaluate and score responses."
+sidebar_label: Llm-As-Judge
 title: inline::llm-as-judge
 ---
@@ -12,10 +10,6 @@ title: inline::llm-as-judge
 LLM-as-judge scoring provider that uses language models to evaluate and score responses.
-## Configuration
-No configuration options available.
 ## Sample Configuration
 ```yaml

View file

@@ -1,7 +1,5 @@
 ---
-description: Available providers for the telemetry API
-sidebar_label: Overview
-sidebar_position: 1
+sidebar_label: Telemetry
 title: Telemetry
 ---
@@ -13,4 +11,4 @@ This section contains documentation for all available providers for the **telemetry** API.
 ## Providers
-- **[Meta Reference](./inline_meta-reference)** - Inline provider
+- [Meta-Reference](./inline_meta-reference)

View file

@@ -1,8 +1,6 @@
 ---
-description: Meta's reference implementation of telemetry and observability using
-  OpenTelemetry
-sidebar_label: Meta Reference
-sidebar_position: 2
+description: "Meta's reference implementation of telemetry and observability using OpenTelemetry."
+sidebar_label: Meta-Reference
 title: inline::meta-reference
 ---
@@ -16,9 +14,9 @@ Meta's reference implementation of telemetry and observability using OpenTelemetry.
 | Field | Type | Required | Default | Description |
 |-------|------|----------|---------|-------------|
-| `otel_exporter_otlp_endpoint` | `str or None` | No | | The OpenTelemetry collector endpoint URL (base URL for traces, metrics, and logs). If not set, the SDK will use OTEL_EXPORTER_OTLP_ENDPOINT environment variable. |
+| `otel_exporter_otlp_endpoint` | `str \| None` | No | | The OpenTelemetry collector endpoint URL (base URL for traces, metrics, and logs). If not set, the SDK will use OTEL_EXPORTER_OTLP_ENDPOINT environment variable. |
 | `service_name` | `<class 'str'>` | No | | The service name to use for telemetry |
-| `sinks` | `list[TelemetrySink]` | No | `[CONSOLE, SQLITE]` | List of telemetry sinks to enable (possible values: otel_trace, otel_metric, sqlite, console) |
+| `sinks` | `list[inline.telemetry.meta_reference.config.TelemetrySink]` | No | [<TelemetrySink.CONSOLE: 'console'>, <TelemetrySink.SQLITE: 'sqlite'>] | List of telemetry sinks to enable (possible values: otel_trace, otel_metric, sqlite, console) |
 | `sqlite_db_path` | `<class 'str'>` | No | ~/.llama/runtime/trace_store.db | The path to the SQLite database to use for storing traces |
 ## Sample Configuration

View file

@@ -1,7 +1,5 @@
 ---
-description: Available providers for the tool_runtime API
-sidebar_label: Overview
-sidebar_position: 1
+sidebar_label: Tool Runtime
 title: Tool_Runtime
 ---
@@ -13,9 +11,9 @@ This section contains documentation for all available providers for the **tool_runtime** API.
 ## Providers
-- **[Rag Runtime](./inline_rag-runtime)** - Inline provider
-- **[Bing Search](./remote_bing-search)** - Remote provider
-- **[Brave Search](./remote_brave-search)** - Remote provider
-- **[Model Context Protocol](./remote_model-context-protocol)** - Remote provider
-- **[Tavily Search](./remote_tavily-search)** - Remote provider
-- **[Wolfram Alpha](./remote_wolfram-alpha)** - Remote provider
+- [Rag-Runtime](./inline_rag-runtime)
+- [Remote - Bing-Search](./remote_bing-search)
+- [Remote - Brave-Search](./remote_brave-search)
+- [Remote - Model-Context-Protocol](./remote_model-context-protocol)
+- [Remote - Tavily-Search](./remote_tavily-search)
+- [Remote - Wolfram-Alpha](./remote_wolfram-alpha)

View file

@@ -1,8 +1,6 @@
 ---
-description: RAG (Retrieval-Augmented Generation) tool runtime for document ingestion,
-  chunking, and semantic search
-sidebar_label: Rag Runtime
-sidebar_position: 2
+description: "RAG (Retrieval-Augmented Generation) tool runtime for document ingestion, chunking, and semantic search."
+sidebar_label: Rag-Runtime
 title: inline::rag-runtime
 ---
@@ -12,10 +10,6 @@ title: inline::rag-runtime
 RAG (Retrieval-Augmented Generation) tool runtime for document ingestion, chunking, and semantic search.
-## Configuration
-No configuration options available.
 ## Sample Configuration
 ```yaml

View file

@@ -1,8 +1,6 @@
 ---
-description: Bing Search tool for web search capabilities using Microsoft's search
-  engine
-sidebar_label: Bing Search
-sidebar_position: 3
+description: "Bing Search tool for web search capabilities using Microsoft's search engine."
+sidebar_label: Remote - Bing-Search
 title: remote::bing-search
 ---

View file

@@ -1,7 +1,6 @@
 ---
-description: Brave Search tool for web search capabilities with privacy-focused results
-sidebar_label: Brave Search
-sidebar_position: 4
+description: "Brave Search tool for web search capabilities with privacy-focused results."
+sidebar_label: Remote - Brave-Search
 title: remote::brave-search
 ---

View file

@@ -1,8 +1,6 @@
 ---
-description: Model Context Protocol (MCP) tool for standardized tool calling and context
-  management
-sidebar_label: Model Context Protocol
-sidebar_position: 5
+description: "Model Context Protocol (MCP) tool for standardized tool calling and context management."
+sidebar_label: Remote - Model-Context-Protocol
 title: remote::model-context-protocol
 ---
@@ -12,10 +10,6 @@ title: remote::model-context-protocol
 Model Context Protocol (MCP) tool for standardized tool calling and context management.
-## Configuration
-No configuration options available.
 ## Sample Configuration
 ```yaml

View file

@@ -1,7 +1,6 @@
 ---
-description: Tavily Search tool for AI-optimized web search with structured results
-sidebar_label: Tavily Search
-sidebar_position: 6
+description: "Tavily Search tool for AI-optimized web search with structured results."
+sidebar_label: Remote - Tavily-Search
 title: remote::tavily-search
 ---

View file

@@ -1,7 +1,6 @@
 ---
-description: Wolfram Alpha tool for computational knowledge and mathematical calculations
-sidebar_label: Wolfram Alpha
-sidebar_position: 7
+description: "Wolfram Alpha tool for computational knowledge and mathematical calculations."
+sidebar_label: Remote - Wolfram-Alpha
 title: remote::wolfram-alpha
 ---

View file

@@ -1,7 +1,5 @@
 ---
-description: Available providers for the vector_io API
-sidebar_label: Overview
-sidebar_position: 1
+sidebar_label: Vector Io
 title: Vector_Io
 ---
@@ -13,15 +11,15 @@ This section contains documentation for all available providers for the **vector_io** API.
 ## Providers
-- **[Chromadb](./inline_chromadb)** - Inline provider
-- **[Faiss](./inline_faiss)** - Inline provider
-- **[Meta Reference](./inline_meta-reference)** - Inline provider
-- **[Milvus](./inline_milvus)** - Inline provider
-- **[Qdrant](./inline_qdrant)** - Inline provider
-- **[SQLite-Vec](./inline_sqlite-vec)** - Inline provider
-- **[SQLite-Vec](./inline_sqlite_vec)** - Inline provider
-- **[Chromadb](./remote_chromadb)** - Remote provider
-- **[Milvus](./remote_milvus)** - Remote provider
-- **[Pgvector](./remote_pgvector)** - Remote provider
-- **[Qdrant](./remote_qdrant)** - Remote provider
-- **[Weaviate](./remote_weaviate)** - Remote provider
+- [Chromadb](./inline_chromadb)
+- [Faiss](./inline_faiss)
+- [Meta-Reference](./inline_meta-reference)
+- [Milvus](./inline_milvus)
+- [Qdrant](./inline_qdrant)
+- [Sqlite-Vec](./inline_sqlite-vec)
+- [Sqlite Vec](./inline_sqlite_vec)
+- [Remote - Chromadb](./remote_chromadb)
+- [Remote - Milvus](./remote_milvus)
+- [Remote - Pgvector](./remote_pgvector)
+- [Remote - Qdrant](./remote_qdrant)
+- [Remote - Weaviate](./remote_weaviate)

View file

@@ -1,7 +1,37 @@
 ---
-description: '[Chroma](https://www'
+description: |
[Chroma](https://www.trychroma.com/) is an inline and remote vector
database provider for Llama Stack. It allows you to store and query vectors directly within a Chroma database.
That means you're not limited to storing vectors in memory or in a separate service.
## Features
Chroma supports:
- Store embeddings and their metadata
- Vector search
- Full-text search
- Document storage
- Metadata filtering
- Multi-modal retrieval
## Usage
To use Chroma in your Llama Stack project, follow these steps:
1. Install the necessary dependencies.
2. Configure your Llama Stack project to use chroma.
3. Start storing and querying vectors.
## Installation
You can install chroma using pip:
```bash
pip install chromadb
```
## Documentation
See [Chroma's documentation](https://docs.trychroma.com/docs/overview/introduction) for more details about Chroma in general.
 sidebar_label: Chromadb
-sidebar_position: 2
 title: inline::chromadb
 ---
@@ -9,10 +39,41 @@ title: inline::chromadb
 ## Description
 [Chroma](https://www.trychroma.com/) is an inline and remote vector
 database provider for Llama Stack. It allows you to store and query vectors directly within a Chroma database.
 That means you're not limited to storing vectors in memory or in a separate service.
## Features
Chroma supports:
- Store embeddings and their metadata
- Vector search
- Full-text search
- Document storage
- Metadata filtering
- Multi-modal retrieval
## Usage
To use Chroma in your Llama Stack project, follow these steps:
1. Install the necessary dependencies.
2. Configure your Llama Stack project to use chroma.
3. Start storing and querying vectors.
## Installation
You can install chroma using pip:
```bash
pip install chromadb
```
## Documentation
See [Chroma's documentation](https://docs.trychroma.com/docs/overview/introduction) for more details about Chroma in general.
 ## Configuration
 | Field | Type | Required | Default | Description |

View file

@@ -1,7 +1,46 @@
 ---
-description: '[Faiss](https://github'
+description: |
[Faiss](https://github.com/facebookresearch/faiss) is an inline vector database provider for Llama Stack. It
allows you to store and query vectors directly in memory.
That means you'll get fast and efficient vector retrieval.
## Features
- Lightweight and easy to use
- Fully integrated with Llama Stack
- GPU support
- **Vector search** - FAISS supports pure vector similarity search using embeddings
## Search Modes
**Supported:**
- **Vector Search** (`mode="vector"`): Performs vector similarity search using embeddings
**Not Supported:**
- **Keyword Search** (`mode="keyword"`): Not supported by FAISS
- **Hybrid Search** (`mode="hybrid"`): Not supported by FAISS
> **Note**: FAISS is designed as a pure vector similarity search library. See the [FAISS GitHub repository](https://github.com/facebookresearch/faiss) for more details about FAISS's core functionality.
## Usage
To use Faiss in your Llama Stack project, follow these steps:
1. Install the necessary dependencies.
2. Configure your Llama Stack project to use Faiss.
3. Start storing and querying vectors.
## Installation
You can install Faiss using pip:
```bash
pip install faiss-cpu
```
## Documentation
See [Faiss' documentation](https://faiss.ai/) or the [Faiss Wiki](https://github.com/facebookresearch/faiss/wiki) for
more details about Faiss in general.
 sidebar_label: Faiss
-sidebar_position: 3
 title: inline::faiss
 ---
@@ -9,10 +48,49 @@ title: inline::faiss
 ## Description
 [Faiss](https://github.com/facebookresearch/faiss) is an inline vector database provider for Llama Stack. It
 allows you to store and query vectors directly in memory.
 That means you'll get fast and efficient vector retrieval.
## Features
- Lightweight and easy to use
- Fully integrated with Llama Stack
- GPU support
- **Vector search** - FAISS supports pure vector similarity search using embeddings
## Search Modes
**Supported:**
- **Vector Search** (`mode="vector"`): Performs vector similarity search using embeddings
**Not Supported:**
- **Keyword Search** (`mode="keyword"`): Not supported by FAISS
- **Hybrid Search** (`mode="hybrid"`): Not supported by FAISS
> **Note**: FAISS is designed as a pure vector similarity search library. See the [FAISS GitHub repository](https://github.com/facebookresearch/faiss) for more details about FAISS's core functionality.
## Usage
To use Faiss in your Llama Stack project, follow these steps:
1. Install the necessary dependencies.
2. Configure your Llama Stack project to use Faiss.
3. Start storing and querying vectors.
## Installation
You can install Faiss using pip:
```bash
pip install faiss-cpu
```
## Documentation
See [Faiss' documentation](https://faiss.ai/) or the [Faiss Wiki](https://github.com/facebookresearch/faiss/wiki) for
more details about Faiss in general.
 ## Configuration
 | Field | Type | Required | Default | Description |

View file

@@ -1,7 +1,6 @@
 ---
-description: Meta's reference implementation of a vector database
-sidebar_label: Meta Reference
-sidebar_position: 4
+description: "Meta's reference implementation of a vector database."
+sidebar_label: Meta-Reference
 title: inline::meta-reference
 ---
@@ -24,3 +23,9 @@ kvstore:
   type: sqlite
   db_path: ${env.SQLITE_STORE_DIR:=~/.llama/dummy}/faiss_store.db
 ```
## Deprecation Notice
:::warning
Please use the `inline::faiss` provider instead.
:::

View file

@@ -1,7 +1,6 @@
 ---
-description: Please refer to the remote provider documentation
+description: "Please refer to the remote provider documentation."
 sidebar_label: Milvus
-sidebar_position: 5
 title: inline::milvus
 ---
@@ -9,8 +8,10 @@ title: inline::milvus
 ## Description
 Please refer to the remote provider documentation.
 ## Configuration
 | Field | Type | Required | Default | Description |

View file

@@ -1,7 +1,47 @@
 ---
-description: '[Qdrant](https://qdrant'
+description: |
[Qdrant](https://qdrant.tech/documentation/) is an inline and remote vector database provider for Llama Stack. It
allows you to store and query vectors directly in memory.
That means you'll get fast and efficient vector retrieval.
> By default, Qdrant stores vectors in RAM, delivering incredibly fast access for datasets that fit comfortably in
> memory. But when your dataset exceeds RAM capacity, Qdrant offers Memmap as an alternative.
>
> \[[An Introduction to Vector Databases](https://qdrant.tech/articles/what-is-a-vector-database/)\]
## Features
- Lightweight and easy to use
- Fully integrated with Llama Stack
- Apache 2.0 license terms
- Store embeddings and their metadata
- Supports search by
[Keyword](https://qdrant.tech/articles/qdrant-introduces-full-text-filters-and-indexes/)
and [Hybrid](https://qdrant.tech/articles/hybrid-search/#building-a-hybrid-search-system-in-qdrant) search
- [Multilingual and Multimodal retrieval](https://qdrant.tech/documentation/multimodal-search/)
- [Metadata filtering](https://qdrant.tech/articles/vector-search-filtering/)
- [GPU support](https://qdrant.tech/documentation/guides/running-with-gpu/)
## Usage
To use Qdrant in your Llama Stack project, follow these steps:
1. Install the necessary dependencies.
2. Configure your Llama Stack project to use Qdrant.
3. Start storing and querying vectors.
## Installation
You can install Qdrant using docker:
```bash
docker pull qdrant/qdrant
```
## Documentation
See the [Qdrant documentation](https://qdrant.tech/documentation/) for more details about Qdrant in general.
 sidebar_label: Qdrant
-sidebar_position: 6
 title: inline::qdrant
 ---
@@ -9,6 +49,7 @@ title: inline::qdrant
 ## Description
 [Qdrant](https://qdrant.tech/documentation/) is an inline and remote vector database provider for Llama Stack. It
 allows you to store and query vectors directly in memory.
 That means you'll get fast and efficient vector retrieval.
@@ -18,6 +59,40 @@ That means you'll get fast and efficient vector retrieval.
 >
 > \[[An Introduction to Vector Databases](https://qdrant.tech/articles/what-is-a-vector-database/)\]
## Features
- Lightweight and easy to use
- Fully integrated with Llama Stack
- Apache 2.0 license terms
- Store embeddings and their metadata
- Supports search by
[Keyword](https://qdrant.tech/articles/qdrant-introduces-full-text-filters-and-indexes/)
and [Hybrid](https://qdrant.tech/articles/hybrid-search/#building-a-hybrid-search-system-in-qdrant) search
- [Multilingual and Multimodal retrieval](https://qdrant.tech/documentation/multimodal-search/)
- [Metadata filtering](https://qdrant.tech/articles/vector-search-filtering/)
- [GPU support](https://qdrant.tech/documentation/guides/running-with-gpu/)
## Usage
To use Qdrant in your Llama Stack project, follow these steps:
1. Install the necessary dependencies.
2. Configure your Llama Stack project to use Qdrant.
3. Start storing and querying vectors.
## Installation
You can install Qdrant using docker:
```bash
docker pull qdrant/qdrant
```
## Documentation
See the [Qdrant documentation](https://qdrant.tech/documentation/) for more details about Qdrant in general.
 ## Configuration
 | Field | Type | Required | Default | Description |

View file

@@ -1,7 +1,202 @@
 ---
-description: '[SQLite-Vec](https://github'
-sidebar_label: SQLite-Vec
-sidebar_position: 7
+description: |
[SQLite-Vec](https://github.com/asg017/sqlite-vec) is an inline vector database provider for Llama Stack. It
allows you to store and query vectors directly within an SQLite database.
That means you're not limited to storing vectors in memory or in a separate service.
## Features
- Lightweight and easy to use
- Fully integrated with Llama Stack
- Uses disk-based storage for persistence, allowing for larger vector storage
### Comparison to Faiss
The choice between Faiss and sqlite-vec should be made based on the needs of your application,
as they have different strengths.
#### Choosing the Right Provider
Scenario | Recommended Tool | Reason
-- |-----------------| --
Online Analytical Processing (OLAP) | Faiss | Fast, in-memory searches
Online Transaction Processing (OLTP) | sqlite-vec | Frequent writes and reads
Frequent writes | sqlite-vec | Efficient disk-based storage and incremental indexing
Large datasets | sqlite-vec | Disk-based storage for larger vector storage
Datasets that can fit in memory, frequent reads | Faiss | Optimized for speed, indexing, and GPU acceleration
#### Empirical Example
Consider the histogram below in which 10,000 randomly generated strings were inserted
in batches of 100 into both Faiss and sqlite-vec using `client.tool_runtime.rag_tool.insert()`.
```{image} ../../../../_static/providers/vector_io/write_time_comparison_sqlite-vec-faiss.png
:alt: Comparison of SQLite-Vec and Faiss write times
:width: 400px
```
You will notice that the average write time for `sqlite-vec` was 788ms, compared to
47,640ms for Faiss. While the number is jarring, if you look at the distribution, you can see that it is rather
uniformly spread across the [1500, 100000] interval.
Looking at each individual write in the order that the documents are inserted you'll see the increase in
write speed as Faiss reindexes the vectors after each write.
```{image} ../../../../_static/providers/vector_io/write_time_sequence_sqlite-vec-faiss.png
:alt: Comparison of SQLite-Vec and Faiss write times
:width: 400px
```
In comparison, the read times for Faiss were on average 10% faster than those of sqlite-vec.
The modes of the two distributions highlight the differences much further where Faiss
will likely yield faster read performance.
```{image} ../../../../_static/providers/vector_io/read_time_comparison_sqlite-vec-faiss.png
:alt: Comparison of SQLite-Vec and Faiss read times
:width: 400px
```
## Usage
To use sqlite-vec in your Llama Stack project, follow these steps:
1. Install the necessary dependencies.
2. Configure your Llama Stack project to use SQLite-Vec.
3. Start storing and querying vectors.
The SQLite-vec provider supports three search modes:
1. **Vector Search** (`mode="vector"`): Performs pure vector similarity search using the embeddings.
2. **Keyword Search** (`mode="keyword"`): Performs full-text search using SQLite's FTS5.
3. **Hybrid Search** (`mode="hybrid"`): Combines both vector and keyword search for better results. First performs keyword search to get candidate matches, then applies vector similarity search on those candidates.
Example with hybrid search:
```python
response = await vector_io.query_chunks(
vector_db_id="my_db",
query="your query here",
params={"mode": "hybrid", "max_chunks": 3, "score_threshold": 0.7},
)
# Using RRF ranker
response = await vector_io.query_chunks(
vector_db_id="my_db",
query="your query here",
params={
"mode": "hybrid",
"max_chunks": 3,
"score_threshold": 0.7,
"ranker": {"type": "rrf", "impact_factor": 60.0},
},
)
# Using weighted ranker
response = await vector_io.query_chunks(
vector_db_id="my_db",
query="your query here",
params={
"mode": "hybrid",
"max_chunks": 3,
"score_threshold": 0.7,
"ranker": {"type": "weighted", "alpha": 0.7}, # 70% vector, 30% keyword
},
)
```
Example with explicit vector search:
```python
response = await vector_io.query_chunks(
vector_db_id="my_db",
query="your query here",
params={"mode": "vector", "max_chunks": 3, "score_threshold": 0.7},
)
```
Example with keyword search:
```python
response = await vector_io.query_chunks(
vector_db_id="my_db",
query="your query here",
params={"mode": "keyword", "max_chunks": 3, "score_threshold": 0.7},
)
```
## Supported Search Modes
The SQLite vector store supports three search modes:
1. **Vector Search** (`mode="vector"`): Uses vector similarity to find relevant chunks
2. **Keyword Search** (`mode="keyword"`): Uses keyword matching to find relevant chunks
3. **Hybrid Search** (`mode="hybrid"`): Combines both vector and keyword scores using a ranker
### Hybrid Search
Hybrid search combines the strengths of both vector and keyword search by:
- Computing vector similarity scores
- Computing keyword match scores
- Using a ranker to combine these scores
Two ranker types are supported:
1. **RRF (Reciprocal Rank Fusion)**:
- Combines ranks from both vector and keyword results
- Uses an impact factor (default: 60.0) to control the weight of higher-ranked results
- Good for balancing between vector and keyword results
- The default impact factor of 60.0 comes from the original RRF paper by Cormack et al. (2009) [^1], which found this value to provide optimal performance across various retrieval tasks
2. **Weighted**:
- Linearly combines normalized vector and keyword scores
- Uses an alpha parameter (0-1) to control the blend:
- alpha=0: Only use keyword scores
- alpha=1: Only use vector scores
- alpha=0.5: Equal weight to both (default)
Example using RAGQueryConfig with different search modes:
```python
from llama_stack.apis.tools import RAGQueryConfig, RRFRanker, WeightedRanker
# Vector search
config = RAGQueryConfig(mode="vector", max_chunks=5)
# Keyword search
config = RAGQueryConfig(mode="keyword", max_chunks=5)
# Hybrid search with custom RRF ranker
config = RAGQueryConfig(
mode="hybrid",
max_chunks=5,
ranker=RRFRanker(impact_factor=50.0), # Custom impact factor
)
# Hybrid search with weighted ranker
config = RAGQueryConfig(
mode="hybrid",
max_chunks=5,
ranker=WeightedRanker(alpha=0.7), # 70% vector, 30% keyword
)
# Hybrid search with default RRF ranker
config = RAGQueryConfig(
mode="hybrid", max_chunks=5
) # Will use RRF with impact_factor=60.0
```
Note: The ranker configuration is only used in hybrid mode. For vector or keyword modes, the ranker parameter is ignored.
## Installation
You can install SQLite-Vec using pip:
```bash
pip install sqlite-vec
```
## Documentation
See [sqlite-vec's GitHub repo](https://github.com/asg017/sqlite-vec/tree/main) for more details about sqlite-vec in general.
[^1]: Cormack, G. V., Clarke, C. L., & Buettcher, S. (2009). [Reciprocal rank fusion outperforms condorcet and individual rank learning methods](https://dl.acm.org/doi/10.1145/1571941.1572114). In Proceedings of the 32nd international ACM SIGIR conference on Research and development in information retrieval (pp. 758-759).
sidebar_label: Sqlite-Vec
title: inline::sqlite-vec title: inline::sqlite-vec
--- ---
@ -9,10 +204,205 @@ title: inline::sqlite-vec
## Description ## Description
[SQLite-Vec](https://github.com/asg017/sqlite-vec) is an inline vector database provider for Llama Stack. It [SQLite-Vec](https://github.com/asg017/sqlite-vec) is an inline vector database provider for Llama Stack. It
allows you to store and query vectors directly within an SQLite database. allows you to store and query vectors directly within an SQLite database.
That means you're not limited to storing vectors in memory or in a separate service. That means you're not limited to storing vectors in memory or in a separate service.
## Features
- Lightweight and easy to use
- Fully integrated with Llama Stack
- Uses disk-based storage for persistence, allowing for larger vector storage
### Comparison to Faiss
The choice between Faiss and sqlite-vec should be made based on the needs of your application,
as they have different strengths.
#### Choosing the Right Provider
Scenario | Recommended Tool | Reason
-- |-----------------| --
Online Analytical Processing (OLAP) | Faiss | Fast, in-memory searches
Online Transaction Processing (OLTP) | sqlite-vec | Frequent writes and reads
Frequent writes | sqlite-vec | Efficient disk-based storage and incremental indexing
Large datasets | sqlite-vec | Disk-based storage for larger vector storage
Datasets that can fit in memory, frequent reads | Faiss | Optimized for speed, indexing, and GPU acceleration
#### Empirical Example
Consider the histogram below in which 10,000 randomly generated strings were inserted
in batches of 100 into both Faiss and sqlite-vec using `client.tool_runtime.rag_tool.insert()`.
```{image} ../../../../_static/providers/vector_io/write_time_comparison_sqlite-vec-faiss.png
:alt: Comparison of SQLite-Vec and Faiss write times
:width: 400px
```
You will notice that the average write time for `sqlite-vec` was 788ms, compared to
47,640ms for Faiss. While the number is jarring, the distribution shows that the Faiss write times are spread
rather uniformly across the [1500, 100000] ms interval.
Looking at each individual write in the order the documents were inserted, you'll see write times increase
as Faiss re-indexes the vectors after each write.
```{image} ../../../../_static/providers/vector_io/write_time_sequence_sqlite-vec-faiss.png
:alt: Comparison of SQLite-Vec and Faiss write times
:width: 400px
```
In comparison, the read times for Faiss were on average 10% faster than for sqlite-vec.
The modes of the two distributions highlight the difference further: Faiss
will likely yield faster read performance.
```{image} ../../../../_static/providers/vector_io/read_time_comparison_sqlite-vec-faiss.png
:alt: Comparison of SQLite-Vec and Faiss read times
:width: 400px
```
## Usage
To use sqlite-vec in your Llama Stack project, follow these steps:
1. Install the necessary dependencies.
2. Configure your Llama Stack project to use SQLite-Vec (a minimal provider entry is sketched below).
3. Start storing and querying vectors.
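For step 2, a minimal `vector_io` provider entry in your run configuration might look like the sketch below. The paths are placeholders, and the fields mirror the sample configuration shown later on this page; adjust them for your distribution:
```yaml
vector_io:
  - provider_id: sqlite-vec
    provider_type: inline::sqlite-vec
    config:
      # Where the vector tables live (placeholder path)
      db_path: ${env.SQLITE_STORE_DIR:=~/.llama/dummy}/sqlite_vec.db
      kvstore:
        type: sqlite
        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/dummy}/sqlite_vec_registry.db
```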
The SQLite-vec provider supports three search modes:
1. **Vector Search** (`mode="vector"`): Performs pure vector similarity search using the embeddings.
2. **Keyword Search** (`mode="keyword"`): Performs full-text search using SQLite's FTS5.
3. **Hybrid Search** (`mode="hybrid"`): Combines both vector and keyword search for better results. First performs keyword search to get candidate matches, then applies vector similarity search on those candidates.
Example with hybrid search:
```python
response = await vector_io.query_chunks(
vector_db_id="my_db",
query="your query here",
params={"mode": "hybrid", "max_chunks": 3, "score_threshold": 0.7},
)
# Using RRF ranker
response = await vector_io.query_chunks(
vector_db_id="my_db",
query="your query here",
params={
"mode": "hybrid",
"max_chunks": 3,
"score_threshold": 0.7,
"ranker": {"type": "rrf", "impact_factor": 60.0},
},
)
# Using weighted ranker
response = await vector_io.query_chunks(
vector_db_id="my_db",
query="your query here",
params={
"mode": "hybrid",
"max_chunks": 3,
"score_threshold": 0.7,
"ranker": {"type": "weighted", "alpha": 0.7}, # 70% vector, 30% keyword
},
)
```
Example with explicit vector search:
```python
response = await vector_io.query_chunks(
vector_db_id="my_db",
query="your query here",
params={"mode": "vector", "max_chunks": 3, "score_threshold": 0.7},
)
```
Example with keyword search:
```python
response = await vector_io.query_chunks(
vector_db_id="my_db",
query="your query here",
params={"mode": "keyword", "max_chunks": 3, "score_threshold": 0.7},
)
```
## Supported Search Modes
The SQLite vector store supports three search modes:
1. **Vector Search** (`mode="vector"`): Uses vector similarity to find relevant chunks
2. **Keyword Search** (`mode="keyword"`): Uses keyword matching to find relevant chunks
3. **Hybrid Search** (`mode="hybrid"`): Combines both vector and keyword scores using a ranker
### Hybrid Search
Hybrid search combines the strengths of both vector and keyword search by:
- Computing vector similarity scores
- Computing keyword match scores
- Using a ranker to combine these scores
Two ranker types are supported (a toy sketch of the RRF scoring formula follows this list):
1. **RRF (Reciprocal Rank Fusion)**:
- Combines ranks from both vector and keyword results
- Uses an impact factor (default: 60.0) to control the weight of higher-ranked results
- Good for balancing between vector and keyword results
- The default impact factor of 60.0 comes from the original RRF paper by Cormack et al. (2009) [^1], which found this value to provide optimal performance across various retrieval tasks
2. **Weighted**:
- Linearly combines normalized vector and keyword scores
- Uses an alpha parameter (0-1) to control the blend:
- alpha=0: Only use keyword scores
- alpha=1: Only use vector scores
- alpha=0.5: Equal weight to both (default)
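To make the RRF combination concrete, here is a toy sketch of the scoring formula (an illustration of the idea only, not the provider's exact implementation):
```python
def rrf_score(vector_rank: int, keyword_rank: int, impact_factor: float = 60.0) -> float:
    # Reciprocal Rank Fusion: each result list contributes 1 / (impact_factor + rank),
    # so documents ranked highly in either list receive a larger combined score.
    return 1.0 / (impact_factor + vector_rank) + 1.0 / (impact_factor + keyword_rank)


# A chunk ranked 1st by vector search and 10th by keyword search:
print(rrf_score(1, 10))  # ~0.0307
```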
Example using RAGQueryConfig with different search modes:
```python
from llama_stack.apis.tools import RAGQueryConfig, RRFRanker, WeightedRanker
# Vector search
config = RAGQueryConfig(mode="vector", max_chunks=5)
# Keyword search
config = RAGQueryConfig(mode="keyword", max_chunks=5)
# Hybrid search with custom RRF ranker
config = RAGQueryConfig(
mode="hybrid",
max_chunks=5,
ranker=RRFRanker(impact_factor=50.0), # Custom impact factor
)
# Hybrid search with weighted ranker
config = RAGQueryConfig(
mode="hybrid",
max_chunks=5,
ranker=WeightedRanker(alpha=0.7), # 70% vector, 30% keyword
)
# Hybrid search with default RRF ranker
config = RAGQueryConfig(
mode="hybrid", max_chunks=5
) # Will use RRF with impact_factor=60.0
```
Note: The ranker configuration is only used in hybrid mode. For vector or keyword modes, the ranker parameter is ignored.
## Installation
You can install SQLite-Vec using pip:
```bash
pip install sqlite-vec
```
## Documentation
See [sqlite-vec's GitHub repo](https://github.com/asg017/sqlite-vec/tree/main) for more details about sqlite-vec in general.
[^1]: Cormack, G. V., Clarke, C. L., & Buettcher, S. (2009). [Reciprocal rank fusion outperforms condorcet and individual rank learning methods](https://dl.acm.org/doi/10.1145/1571941.1572114). In Proceedings of the 32nd international ACM SIGIR conference on Research and development in information retrieval (pp. 758-759).
## Configuration ## Configuration
| Field | Type | Required | Default | Description | | Field | Type | Required | Default | Description |


@ -0,0 +1,35 @@
---
description: "Please refer to the sqlite-vec provider documentation."
sidebar_label: Sqlite Vec
title: inline::sqlite_vec
---
# inline::sqlite_vec
## Description
Please refer to the sqlite-vec provider documentation.
## Configuration
| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `db_path` | `<class 'str'>` | No | | Path to the SQLite database file |
| `kvstore` | `utils.kvstore.config.RedisKVStoreConfig \| utils.kvstore.config.SqliteKVStoreConfig \| utils.kvstore.config.PostgresKVStoreConfig \| utils.kvstore.config.MongoDBKVStoreConfig` | No | sqlite | Config for KV store backend (SQLite only for now) |
## Sample Configuration
```yaml
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/dummy}/sqlite_vec.db
kvstore:
type: sqlite
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/dummy}/sqlite_vec_registry.db
```
## Deprecation Notice
:::warning
Please use the `inline::sqlite-vec` provider (notice the hyphen instead of underscore) instead.
:::


@ -1,7 +1,37 @@
--- ---
description: '[Chroma](https://www' description: |
sidebar_label: Chromadb [Chroma](https://www.trychroma.com/) is an inline and remote vector
sidebar_position: 9 database provider for Llama Stack. It allows you to store and query vectors directly within a Chroma database.
That means you're not limited to storing vectors in memory or in a separate service.
## Features
Chroma supports:
- Store embeddings and their metadata
- Vector search
- Full-text search
- Document storage
- Metadata filtering
- Multi-modal retrieval
## Usage
To use Chroma in your Llama Stack project, follow these steps:
1. Install the necessary dependencies.
2. Configure your Llama Stack project to use Chroma.
3. Start storing and querying vectors.
## Installation
You can install chroma using pip:
```bash
pip install chromadb
```
## Documentation
See [Chroma's documentation](https://docs.trychroma.com/docs/overview/introduction) for more details about Chroma in general.
sidebar_label: Remote - Chromadb
title: remote::chromadb title: remote::chromadb
--- ---
@ -9,10 +39,40 @@ title: remote::chromadb
## Description ## Description
[Chroma](https://www.trychroma.com/) is an inline and remote vector [Chroma](https://www.trychroma.com/) is an inline and remote vector
database provider for Llama Stack. It allows you to store and query vectors directly within a Chroma database. database provider for Llama Stack. It allows you to store and query vectors directly within a Chroma database.
That means you're not limited to storing vectors in memory or in a separate service. That means you're not limited to storing vectors in memory or in a separate service.
## Features
Chroma supports:
- Store embeddings and their metadata
- Vector search
- Full-text search
- Document storage
- Metadata filtering
- Multi-modal retrieval
## Usage
To use Chroma in your Llama Stack project, follow these steps:
1. Install the necessary dependencies.
2. Configure your Llama Stack project to use Chroma (a minimal provider entry is sketched below).
3. Start storing and querying vectors.
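For step 2, a provider entry along these lines should work. The `url` field name is an assumption here (check the Configuration table below for the authoritative field names), and the endpoint value is a placeholder for your Chroma server:
```yaml
vector_io:
  - provider_id: chromadb
    provider_type: remote::chromadb
    config:
      url: http://localhost:8000  # placeholder Chroma server endpoint
```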
## Installation
You can install chroma using pip:
```bash
pip install chromadb
```
## Documentation
See [Chroma's documentation](https://docs.trychroma.com/docs/overview/introduction) for more details about Chroma in general.
## Configuration ## Configuration
| Field | Type | Required | Default | Description | | Field | Type | Required | Default | Description |


@ -1,18 +1,37 @@
--- ---
description: '[Milvus](https://milvus' description: |
sidebar_label: Milvus
sidebar_position: 10
title: remote::milvus
---
# remote::milvus
## Description
[Milvus](https://milvus.io/) is an inline and remote vector database provider for Llama Stack. It [Milvus](https://milvus.io/) is an inline and remote vector database provider for Llama Stack. It
allows you to store and query vectors directly within a Milvus database. allows you to store and query vectors directly within a Milvus database.
That means you're not limited to storing vectors in memory or in a separate service. That means you're not limited to storing vectors in memory or in a separate service.
## Features
- Easy to use
- Fully integrated with Llama Stack
- Supports all search modes: vector, keyword, and hybrid search (both inline and remote configurations)
## Usage
To use Milvus in your Llama Stack project, follow these steps:
1. Install the necessary dependencies.
2. Configure your Llama Stack project to use Milvus.
3. Start storing and querying vectors.
## Installation
If you want to use inline Milvus, you can install:
```bash
pip install pymilvus[milvus-lite]
```
If you want to use remote Milvus, you can install:
```bash
pip install pymilvus
```
## Configuration ## Configuration
In Llama Stack, Milvus can be configured in two ways: In Llama Stack, Milvus can be configured in two ways:
@ -86,6 +105,316 @@ vector_io:
- **`client_pem_path`**: Path to the **client certificate** file (required for mTLS). - **`client_pem_path`**: Path to the **client certificate** file (required for mTLS).
- **`client_key_path`**: Path to the **client private key** file (required for mTLS). - **`client_key_path`**: Path to the **client private key** file (required for mTLS).
## Search Modes
Milvus supports three different search modes for both inline and remote configurations:
### Vector Search
Vector search uses semantic similarity to find the most relevant chunks based on embedding vectors. This is the default search mode and works well for finding conceptually similar content.
```python
# Vector search example
search_response = client.vector_stores.search(
vector_store_id=vector_store.id,
query="What is machine learning?",
search_mode="vector",
max_num_results=5,
)
```
### Keyword Search
Keyword search uses traditional text-based matching to find chunks containing specific terms or phrases. This is useful when you need exact term matches.
```python
# Keyword search example
search_response = client.vector_stores.search(
vector_store_id=vector_store.id,
query="Python programming language",
search_mode="keyword",
max_num_results=5,
)
```
### Hybrid Search
Hybrid search combines both vector and keyword search methods to provide more comprehensive results. It leverages the strengths of both semantic similarity and exact term matching.
#### Basic Hybrid Search
```python
# Basic hybrid search example (uses RRF ranker with default impact_factor=60.0)
search_response = client.vector_stores.search(
vector_store_id=vector_store.id,
query="neural networks in Python",
search_mode="hybrid",
max_num_results=5,
)
```
**Note**: The default `impact_factor` value of 60.0 was empirically determined to be optimal in the original RRF research paper: ["Reciprocal Rank Fusion outperforms Condorcet and individual Rank Learning Methods"](https://plg.uwaterloo.ca/~gvcormac/cormacksigir09-rrf.pdf) (Cormack et al., 2009).
#### Hybrid Search with RRF (Reciprocal Rank Fusion) Ranker
RRF combines rankings from vector and keyword search by using reciprocal ranks. The impact factor controls how much weight is given to higher-ranked results.
```python
# Hybrid search with custom RRF parameters
search_response = client.vector_stores.search(
vector_store_id=vector_store.id,
query="neural networks in Python",
search_mode="hybrid",
max_num_results=5,
ranking_options={
"ranker": {
"type": "rrf",
"impact_factor": 100.0, # Higher values give more weight to top-ranked results
}
},
)
```
#### Hybrid Search with Weighted Ranker
Weighted ranker linearly combines normalized scores from vector and keyword search. The alpha parameter controls the balance between the two search methods.
```python
# Hybrid search with weighted ranker
search_response = client.vector_stores.search(
vector_store_id=vector_store.id,
query="neural networks in Python",
search_mode="hybrid",
max_num_results=5,
ranking_options={
"ranker": {
"type": "weighted",
"alpha": 0.7, # 70% vector search, 30% keyword search
}
},
)
```
For detailed documentation on RRF and Weighted rankers, please refer to the [Milvus Reranking Guide](https://milvus.io/docs/reranking.md).
## Documentation
See the [Milvus documentation](https://milvus.io/docs/install-overview.md) for more details about Milvus in general.
For more details on TLS configuration, refer to the [TLS setup guide](https://milvus.io/docs/tls.md).
sidebar_label: Remote - Milvus
title: remote::milvus
---
# remote::milvus
## Description
[Milvus](https://milvus.io/) is an inline and remote vector database provider for Llama Stack. It
allows you to store and query vectors directly within a Milvus database.
That means you're not limited to storing vectors in memory or in a separate service.
## Features
- Easy to use
- Fully integrated with Llama Stack
- Supports all search modes: vector, keyword, and hybrid search (both inline and remote configurations)
## Usage
To use Milvus in your Llama Stack project, follow these steps:
1. Install the necessary dependencies.
2. Configure your Llama Stack project to use Milvus.
3. Start storing and querying vectors.
## Installation
If you want to use inline Milvus, you can install:
```bash
pip install pymilvus[milvus-lite]
```
If you want to use remote Milvus, you can install:
```bash
pip install pymilvus
```
## Configuration
In Llama Stack, Milvus can be configured in two ways:
- **Inline (Local) Configuration** - Uses Milvus-Lite for local storage
- **Remote Configuration** - Connects to a remote Milvus server
### Inline (Local) Configuration
The simplest method is local configuration, which requires setting `db_path`, a path for locally storing Milvus-Lite files:
```yaml
vector_io:
- provider_id: milvus
provider_type: inline::milvus
config:
db_path: ~/.llama/distributions/together/milvus_store.db
```
### Remote Configuration
Remote configuration is suitable for larger data storage requirements:
#### Standard Remote Connection
```yaml
vector_io:
- provider_id: milvus
provider_type: remote::milvus
config:
uri: "http://<host>:<port>"
token: "<user>:<password>"
```
#### TLS-Enabled Remote Connection (One-way TLS)
For connections to Milvus instances with one-way TLS enabled:
```yaml
vector_io:
- provider_id: milvus
provider_type: remote::milvus
config:
uri: "https://<host>:<port>"
token: "<user>:<password>"
secure: True
server_pem_path: "/path/to/server.pem"
```
#### Mutual TLS (mTLS) Remote Connection
For connections to Milvus instances with mutual TLS (mTLS) enabled:
```yaml
vector_io:
- provider_id: milvus
provider_type: remote::milvus
config:
uri: "https://<host>:<port>"
token: "<user>:<password>"
secure: True
ca_pem_path: "/path/to/ca.pem"
client_pem_path: "/path/to/client.pem"
client_key_path: "/path/to/client.key"
```
#### Key Parameters for TLS Configuration
- **`secure`**: Enables TLS encryption when set to `true`. Defaults to `false`.
- **`server_pem_path`**: Path to the **server certificate** for verifying the server's identity (used in one-way TLS).
- **`ca_pem_path`**: Path to the **Certificate Authority (CA) certificate** for validating the server certificate (required in mTLS).
- **`client_pem_path`**: Path to the **client certificate** file (required for mTLS).
- **`client_key_path`**: Path to the **client private key** file (required for mTLS).
## Search Modes
Milvus supports three different search modes for both inline and remote configurations:
### Vector Search
Vector search uses semantic similarity to find the most relevant chunks based on embedding vectors. This is the default search mode and works well for finding conceptually similar content.
```python
# Vector search example
search_response = client.vector_stores.search(
vector_store_id=vector_store.id,
query="What is machine learning?",
search_mode="vector",
max_num_results=5,
)
```
### Keyword Search
Keyword search uses traditional text-based matching to find chunks containing specific terms or phrases. This is useful when you need exact term matches.
```python
# Keyword search example
search_response = client.vector_stores.search(
vector_store_id=vector_store.id,
query="Python programming language",
search_mode="keyword",
max_num_results=5,
)
```
### Hybrid Search
Hybrid search combines both vector and keyword search methods to provide more comprehensive results. It leverages the strengths of both semantic similarity and exact term matching.
#### Basic Hybrid Search
```python
# Basic hybrid search example (uses RRF ranker with default impact_factor=60.0)
search_response = client.vector_stores.search(
vector_store_id=vector_store.id,
query="neural networks in Python",
search_mode="hybrid",
max_num_results=5,
)
```
**Note**: The default `impact_factor` value of 60.0 was empirically determined to be optimal in the original RRF research paper: ["Reciprocal Rank Fusion outperforms Condorcet and individual Rank Learning Methods"](https://plg.uwaterloo.ca/~gvcormac/cormacksigir09-rrf.pdf) (Cormack et al., 2009).
#### Hybrid Search with RRF (Reciprocal Rank Fusion) Ranker
RRF combines rankings from vector and keyword search by using reciprocal ranks. The impact factor controls how much weight is given to higher-ranked results.
```python
# Hybrid search with custom RRF parameters
search_response = client.vector_stores.search(
vector_store_id=vector_store.id,
query="neural networks in Python",
search_mode="hybrid",
max_num_results=5,
ranking_options={
"ranker": {
"type": "rrf",
"impact_factor": 100.0, # Higher values give more weight to top-ranked results
}
},
)
```
#### Hybrid Search with Weighted Ranker
Weighted ranker linearly combines normalized scores from vector and keyword search. The alpha parameter controls the balance between the two search methods.
```python
# Hybrid search with weighted ranker
search_response = client.vector_stores.search(
vector_store_id=vector_store.id,
query="neural networks in Python",
search_mode="hybrid",
max_num_results=5,
ranking_options={
"ranker": {
"type": "weighted",
"alpha": 0.7, # 70% vector search, 30% keyword search
}
},
)
```
For detailed documentation on RRF and Weighted rankers, please refer to the [Milvus Reranking Guide](https://milvus.io/docs/reranking.md).
## Documentation
See the [Milvus documentation](https://milvus.io/docs/install-overview.md) for more details about Milvus in general.
For more details on TLS configuration, refer to the [TLS setup guide](https://milvus.io/docs/tls.md).
## Configuration
| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `uri` | `<class 'str'>` | No | | The URI of the Milvus server |
| `token` | `str \| None` | No | | The token of the Milvus server |
| `consistency_level` | `<class 'str'>` | No | Strong | The consistency level of the Milvus server |
| `kvstore` | `utils.kvstore.config.RedisKVStoreConfig \| utils.kvstore.config.SqliteKVStoreConfig \| utils.kvstore.config.PostgresKVStoreConfig \| utils.kvstore.config.MongoDBKVStoreConfig` | No | sqlite | Config for KV store backend |
| `config` | `dict` | No | {} | This configuration allows additional fields to be passed through to the underlying Milvus client. See the [Milvus](https://milvus.io/docs/install-overview.md) documentation for more details about Milvus in general. |
:::note
This configuration class accepts additional fields beyond those listed above. You can pass any additional configuration options that will be forwarded to the underlying provider.
:::
## Sample Configuration ## Sample Configuration
```yaml ```yaml


@ -1,7 +1,105 @@
--- ---
description: '[PGVector](https://github' description: |
sidebar_label: Pgvector [PGVector](https://github.com/pgvector/pgvector) is a remote vector database provider for Llama Stack. It
sidebar_position: 11 allows you to store and query vectors directly in memory.
That means you'll get fast and efficient vector retrieval.
## Features
- Easy to use
- Fully integrated with Llama Stack
There are three implementations of search available for `PGVectorIndex`:
1. Vector Search:
- How it works:
- Uses PostgreSQL's vector extension (pgvector) to perform similarity search
- Compares query embeddings against stored embeddings using Cosine distance or other distance metrics
- Eg. SQL query: SELECT document, embedding <=> %s::vector AS distance FROM table ORDER BY distance
- Characteristics:
- Semantic understanding - finds documents similar in meaning even if they don't share keywords
- Works with high-dimensional vector embeddings (typically 768, 1024, or higher dimensions)
- Best for: Finding conceptually related content, handling synonyms, cross-language search
2. Keyword Search
- How it works:
- Uses PostgreSQL's full-text search capabilities with tsvector and ts_rank
- Converts text to searchable tokens using to_tsvector('english', text). Default language is English.
- Eg. SQL query: SELECT document, ts_rank(tokenized_content, plainto_tsquery('english', %s)) AS score
- Characteristics:
- Lexical matching - finds exact keyword matches and variations
- Uses GIN (Generalized Inverted Index) for fast text search performance
- Scoring: Uses PostgreSQL's ts_rank function for relevance scoring
- Best for: Exact term matching, proper names, technical terms, Boolean-style queries
3. Hybrid Search
- How it works:
- Combines both vector and keyword search results
- Runs both searches independently, then merges results using configurable reranking
- Two reranking strategies available:
- Reciprocal Rank Fusion (RRF) - (default: 60.0)
- Weighted Average - (default: 0.5)
- Characteristics:
- Best of both worlds: semantic understanding + exact matching
- Documents appearing in both searches get boosted scores
- Configurable balance between semantic and lexical matching
- Best for: General-purpose search where you want both precision and recall
4. Database Schema
The PGVector implementation stores data optimized for all three search types:
CREATE TABLE vector_store_xxx (
id TEXT PRIMARY KEY,
document JSONB, -- Original document
embedding vector(dimension), -- For vector search
content_text TEXT, -- Raw text content
tokenized_content TSVECTOR -- For keyword search
);
-- Indexes for performance
CREATE INDEX content_gin_idx ON table USING GIN(tokenized_content); -- Keyword search
-- Vector index created automatically by pgvector
## Usage
To use PGVector in your Llama Stack project, follow these steps:
1. Install the necessary dependencies.
2. Configure your Llama Stack project to use pgvector (e.g. `remote::pgvector`).
3. Start storing and querying vectors.
## Example: Setting up your environment for PGVector
1. Export env vars:
```bash
export ENABLE_PGVECTOR=true
export PGVECTOR_HOST=localhost
export PGVECTOR_PORT=5432
export PGVECTOR_DB=llamastack
export PGVECTOR_USER=llamastack
export PGVECTOR_PASSWORD=llamastack
```
2. Create DB:
```bash
psql -h localhost -U postgres -c "CREATE ROLE llamastack LOGIN PASSWORD 'llamastack';"
psql -h localhost -U postgres -c "CREATE DATABASE llamastack OWNER llamastack;"
psql -h localhost -U llamastack -d llamastack -c "CREATE EXTENSION IF NOT EXISTS vector;"
```
## Installation
You can install PGVector using docker:
```bash
docker pull pgvector/pgvector:pg17
```
## Documentation
See [PGVector's documentation](https://github.com/pgvector/pgvector) for more details about PGVector in general.
sidebar_label: Remote - Pgvector
title: remote::pgvector title: remote::pgvector
--- ---
@ -9,10 +107,108 @@ title: remote::pgvector
## Description ## Description
[PGVector](https://github.com/pgvector/pgvector) is a remote vector database provider for Llama Stack. It [PGVector](https://github.com/pgvector/pgvector) is a remote vector database provider for Llama Stack. It
allows you to store and query vectors directly in memory. allows you to store and query vectors directly in memory.
That means you'll get fast and efficient vector retrieval. That means you'll get fast and efficient vector retrieval.
## Features
- Easy to use
- Fully integrated with Llama Stack
There are three implementations of search available for `PGVectorIndex`:
1. Vector Search:
- How it works:
- Uses PostgreSQL's vector extension (pgvector) to perform similarity search
- Compares query embeddings against stored embeddings using Cosine distance or other distance metrics
- Eg. SQL query: SELECT document, embedding <=> %s::vector AS distance FROM table ORDER BY distance
- Characteristics:
- Semantic understanding - finds documents similar in meaning even if they don't share keywords
- Works with high-dimensional vector embeddings (typically 768, 1024, or higher dimensions)
- Best for: Finding conceptually related content, handling synonyms, cross-language search
2. Keyword Search
- How it works:
- Uses PostgreSQL's full-text search capabilities with tsvector and ts_rank
- Converts text to searchable tokens using to_tsvector('english', text). Default language is English.
- Eg. SQL query: SELECT document, ts_rank(tokenized_content, plainto_tsquery('english', %s)) AS score
- Characteristics:
- Lexical matching - finds exact keyword matches and variations
- Uses GIN (Generalized Inverted Index) for fast text search performance
- Scoring: Uses PostgreSQL's ts_rank function for relevance scoring
- Best for: Exact term matching, proper names, technical terms, Boolean-style queries
3. Hybrid Search
- How it works:
- Combines both vector and keyword search results
- Runs both searches independently, then merges results using configurable reranking
- Two reranking strategies available:
- Reciprocal Rank Fusion (RRF) - (default: 60.0)
- Weighted Average - (default: 0.5)
- Characteristics:
- Best of both worlds: semantic understanding + exact matching
- Documents appearing in both searches get boosted scores
- Configurable balance between semantic and lexical matching
- Best for: General-purpose search where you want both precision and recall
4. Database Schema
The PGVector implementation stores data optimized for all three search types:
CREATE TABLE vector_store_xxx (
id TEXT PRIMARY KEY,
document JSONB, -- Original document
embedding vector(dimension), -- For vector search
content_text TEXT, -- Raw text content
tokenized_content TSVECTOR -- For keyword search
);
-- Indexes for performance
CREATE INDEX content_gin_idx ON table USING GIN(tokenized_content); -- Keyword search
-- Vector index created automatically by pgvector
## Usage
To use PGVector in your Llama Stack project, follow these steps:
1. Install the necessary dependencies.
2. Configure your Llama Stack project to use pgvector (e.g. `remote::pgvector`).
3. Start storing and querying vectors.
## Example: Setting up your environment for PGVector
1. Export env vars:
```bash
export ENABLE_PGVECTOR=true
export PGVECTOR_HOST=localhost
export PGVECTOR_PORT=5432
export PGVECTOR_DB=llamastack
export PGVECTOR_USER=llamastack
export PGVECTOR_PASSWORD=llamastack
```
2. Create DB:
```bash
psql -h localhost -U postgres -c "CREATE ROLE llamastack LOGIN PASSWORD 'llamastack';"
psql -h localhost -U postgres -c "CREATE DATABASE llamastack OWNER llamastack;"
psql -h localhost -U llamastack -d llamastack -c "CREATE EXTENSION IF NOT EXISTS vector;"
```
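3. Point your Llama Stack run configuration at this database. The exact field names below are an assumption (check the Configuration table further down for the authoritative list), but a typical `vector_io` entry might look like:
```yaml
vector_io:
  - provider_id: pgvector
    provider_type: remote::pgvector
    config:
      host: ${env.PGVECTOR_HOST:=localhost}
      port: ${env.PGVECTOR_PORT:=5432}
      db: ${env.PGVECTOR_DB}
      user: ${env.PGVECTOR_USER}
      password: ${env.PGVECTOR_PASSWORD}
```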
## Installation
You can install PGVector using docker:
```bash
docker pull pgvector/pgvector:pg17
```
## Documentation
See [PGVector's documentation](https://github.com/pgvector/pgvector) for more details about PGVector in general.
## Configuration ## Configuration
| Field | Type | Required | Default | Description | | Field | Type | Required | Default | Description |


@ -1,7 +1,6 @@
--- ---
description: Please refer to the inline provider documentation description: "Please refer to the inline provider documentation."
sidebar_label: Qdrant sidebar_label: Remote - Qdrant
sidebar_position: 12
title: remote::qdrant title: remote::qdrant
--- ---
@ -9,8 +8,10 @@ title: remote::qdrant
## Description ## Description
Please refer to the inline provider documentation. Please refer to the inline provider documentation.
## Configuration ## Configuration
| Field | Type | Required | Default | Description | | Field | Type | Required | Default | Description |


@ -1,7 +1,35 @@
--- ---
description: '[Weaviate](https://weaviate' description: |
sidebar_label: Weaviate [Weaviate](https://weaviate.io/) is a vector database provider for Llama Stack.
sidebar_position: 13 It allows you to store and query vectors directly within a Weaviate database.
That means you're not limited to storing vectors in memory or in a separate service.
## Features
Weaviate supports:
- Store embeddings and their metadata
- Vector search
- Full-text search
- Hybrid search
- Document storage
- Metadata filtering
- Multi-modal retrieval
## Usage
To use Weaviate in your Llama Stack project, follow these steps:
1. Install the necessary dependencies.
2. Configure your Llama Stack project to use Weaviate.
3. Start storing and querying vectors.
## Installation
To install Weaviate see the [Weaviate quickstart documentation](https://weaviate.io/developers/weaviate/quickstart).
## Documentation
See [Weaviate's documentation](https://weaviate.io/developers/weaviate) for more details about Weaviate in general.
sidebar_label: Remote - Weaviate
title: remote::weaviate title: remote::weaviate
--- ---
@ -9,10 +37,38 @@ title: remote::weaviate
## Description ## Description
[Weaviate](https://weaviate.io/) is a vector database provider for Llama Stack. [Weaviate](https://weaviate.io/) is a vector database provider for Llama Stack.
It allows you to store and query vectors directly within a Weaviate database. It allows you to store and query vectors directly within a Weaviate database.
That means you're not limited to storing vectors in memory or in a separate service. That means you're not limited to storing vectors in memory or in a separate service.
## Features
Weaviate supports:
- Store embeddings and their metadata
- Vector search
- Full-text search
- Hybrid search
- Document storage
- Metadata filtering
- Multi-modal retrieval
## Usage
To use Weaviate in your Llama Stack project, follow these steps:
1. Install the necessary dependencies.
2. Configure your Llama Stack project to use Weaviate.
3. Start storing and querying vectors.
## Installation
To install Weaviate see the [Weaviate quickstart documentation](https://weaviate.io/developers/weaviate/quickstart).
## Documentation
See [Weaviate's documentation](https://weaviate.io/developers/weaviate) for more details about Weaviate in general.
## Configuration ## Configuration
| Field | Type | Required | Default | Description | | Field | Type | Required | Default | Description |


@ -0,0 +1,125 @@
---
orphan: true
---
<!-- This file was auto-generated by distro_codegen.py, please edit source -->
# Meta Reference GPU Distribution
```{toctree}
:maxdepth: 2
:hidden:
self
```
The `llamastack/distribution-meta-reference-gpu` distribution consists of the following provider configurations:
| API | Provider(s) |
|-----|-------------|
| agents | `inline::meta-reference` |
| datasetio | `remote::huggingface`, `inline::localfs` |
| eval | `inline::meta-reference` |
| inference | `inline::meta-reference` |
| safety | `inline::llama-guard` |
| scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` |
| telemetry | `inline::meta-reference` |
| tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::rag-runtime`, `remote::model-context-protocol` |
| vector_io | `inline::faiss`, `remote::chromadb`, `remote::pgvector` |
Note that you need access to NVIDIA GPUs to run this distribution. It is not compatible with CPU-only machines or machines with AMD GPUs.
### Environment Variables
The following environment variables can be configured:
- `LLAMA_STACK_PORT`: Port for the Llama Stack distribution server (default: `8321`)
- `INFERENCE_MODEL`: Inference model loaded into the Meta Reference server (default: `meta-llama/Llama-3.2-3B-Instruct`)
- `INFERENCE_CHECKPOINT_DIR`: Directory containing the Meta Reference model checkpoint (default: `null`)
- `SAFETY_MODEL`: Name of the safety (Llama-Guard) model to use (default: `meta-llama/Llama-Guard-3-1B`)
- `SAFETY_CHECKPOINT_DIR`: Directory containing the Llama-Guard model checkpoint (default: `null`)
## Prerequisite: Downloading Models
Please use `llama model list --downloaded` to check that you have Llama model checkpoints downloaded in `~/.llama` before proceeding. See the [installation guide](../../references/llama_cli_reference/download_models.md) for how to download the models. Run `llama model list` to see the available models to download, and `llama model download` to download the checkpoints.
```
$ llama model list --downloaded
┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┓
┃ Model ┃ Size ┃ Modified Time ┃
┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━┩
│ Llama3.2-1B-Instruct:int4-qlora-eo8 │ 1.53 GB │ 2025-02-26 11:22:28 │
├─────────────────────────────────────────┼──────────┼─────────────────────┤
│ Llama3.2-1B │ 2.31 GB │ 2025-02-18 21:48:52 │
├─────────────────────────────────────────┼──────────┼─────────────────────┤
│ Prompt-Guard-86M │ 0.02 GB │ 2025-02-26 11:29:28 │
├─────────────────────────────────────────┼──────────┼─────────────────────┤
│ Llama3.2-3B-Instruct:int4-spinquant-eo8 │ 3.69 GB │ 2025-02-26 11:37:41 │
├─────────────────────────────────────────┼──────────┼─────────────────────┤
│ Llama3.2-3B │ 5.99 GB │ 2025-02-18 21:51:26 │
├─────────────────────────────────────────┼──────────┼─────────────────────┤
│ Llama3.1-8B │ 14.97 GB │ 2025-02-16 10:36:37 │
├─────────────────────────────────────────┼──────────┼─────────────────────┤
│ Llama3.2-1B-Instruct:int4-spinquant-eo8 │ 1.51 GB │ 2025-02-26 11:35:02 │
├─────────────────────────────────────────┼──────────┼─────────────────────┤
│ Llama-Guard-3-1B │ 2.80 GB │ 2025-02-26 11:20:46 │
├─────────────────────────────────────────┼──────────┼─────────────────────┤
│ Llama-Guard-3-1B:int4 │ 0.43 GB │ 2025-02-26 11:33:33 │
└─────────────────────────────────────────┴──────────┴─────────────────────┘
```
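If a checkpoint is missing, you can fetch it with `llama model download`. The `--source` and `--model-id` values below are illustrative; run `llama model download --help` for the authoritative options:
```bash
# Download the default inference model for this distribution from Meta
# (you will be prompted for the signed download URL you received from Meta).
llama model download --source meta --model-id Llama3.2-3B-Instruct
```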
## Running the Distribution
You can do this via venv, or via Docker, which has a pre-built image.
### Via Docker
This method allows you to get started quickly without having to build the distribution code.
```bash
LLAMA_STACK_PORT=8321
docker run \
-it \
--pull always \
--gpus all \
-p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
-v ~/.llama:/root/.llama \
llamastack/distribution-meta-reference-gpu \
--port $LLAMA_STACK_PORT \
--env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct
```
If you are using Llama Stack Safety / Shield APIs, use:
```bash
docker run \
-it \
--pull always \
--gpus all \
-p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
-v ~/.llama:/root/.llama \
llamastack/distribution-meta-reference-gpu \
--port $LLAMA_STACK_PORT \
--env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct \
--env SAFETY_MODEL=meta-llama/Llama-Guard-3-1B
```
### Via venv
Make sure you have done `uv pip install llama-stack` and have the Llama Stack CLI available.
```bash
llama stack build --distro meta-reference-gpu --image-type venv
llama stack run distributions/meta-reference-gpu/run.yaml \
--port 8321 \
--env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct
```
If you are using Llama Stack Safety / Shield APIs, use:
```bash
llama stack run distributions/meta-reference-gpu/run-with-safety.yaml \
--port 8321 \
--env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct \
--env SAFETY_MODEL=meta-llama/Llama-Guard-3-1B
```


@ -0,0 +1,171 @@
---
orphan: true
---
<!-- This file was auto-generated by distro_codegen.py, please edit source -->
# NVIDIA Distribution
The `llamastack/distribution-nvidia` distribution consists of the following provider configurations.
| API | Provider(s) |
|-----|-------------|
| agents | `inline::meta-reference` |
| datasetio | `inline::localfs`, `remote::nvidia` |
| eval | `remote::nvidia` |
| files | `inline::localfs` |
| inference | `remote::nvidia` |
| post_training | `remote::nvidia` |
| safety | `remote::nvidia` |
| scoring | `inline::basic` |
| telemetry | `inline::meta-reference` |
| tool_runtime | `inline::rag-runtime` |
| vector_io | `inline::faiss` |
### Environment Variables
The following environment variables can be configured:
- `NVIDIA_API_KEY`: NVIDIA API Key (default: ``)
- `NVIDIA_APPEND_API_VERSION`: Whether to append the API version to the base_url (default: `True`)
- `NVIDIA_DATASET_NAMESPACE`: NVIDIA Dataset Namespace (default: `default`)
- `NVIDIA_PROJECT_ID`: NVIDIA Project ID (default: `test-project`)
- `NVIDIA_CUSTOMIZER_URL`: NVIDIA Customizer URL (default: `https://customizer.api.nvidia.com`)
- `NVIDIA_OUTPUT_MODEL_DIR`: NVIDIA Output Model Directory (default: `test-example-model@v1`)
- `GUARDRAILS_SERVICE_URL`: URL for the NeMo Guardrails Service (default: `http://0.0.0.0:7331`)
- `NVIDIA_GUARDRAILS_CONFIG_ID`: NVIDIA Guardrail Configuration ID (default: `self-check`)
- `NVIDIA_EVALUATOR_URL`: URL for the NeMo Evaluator Service (default: `http://0.0.0.0:7331`)
- `INFERENCE_MODEL`: Inference model (default: `Llama3.1-8B-Instruct`)
- `SAFETY_MODEL`: Name of the model to use for safety (default: `meta/llama-3.1-8b-instruct`)
### Models
The following models are available by default:
- `meta/llama3-8b-instruct`
- `meta/llama3-70b-instruct`
- `meta/llama-3.1-8b-instruct`
- `meta/llama-3.1-70b-instruct`
- `meta/llama-3.1-405b-instruct`
- `meta/llama-3.2-1b-instruct`
- `meta/llama-3.2-3b-instruct`
- `meta/llama-3.2-11b-vision-instruct`
- `meta/llama-3.2-90b-vision-instruct`
- `meta/llama-3.3-70b-instruct`
- `nvidia/vila`
- `nvidia/llama-3.2-nv-embedqa-1b-v2`
- `nvidia/nv-embedqa-e5-v5`
- `nvidia/nv-embedqa-mistral-7b-v2`
- `snowflake/arctic-embed-l`
## Prerequisites
### NVIDIA API Keys
Make sure you have access to a NVIDIA API Key. You can get one by visiting [https://build.nvidia.com/](https://build.nvidia.com/). Use this key for the `NVIDIA_API_KEY` environment variable.
### Deploy NeMo Microservices Platform
The NVIDIA NeMo microservices platform supports end-to-end microservice deployment of a complete AI flywheel on your Kubernetes cluster through the NeMo Microservices Helm Chart. Please reference the [NVIDIA NeMo Microservices documentation](https://docs.nvidia.com/nemo/microservices/latest/about/index.html) for platform prerequisites and instructions to install and deploy the platform.
## Supported Services
Each Llama Stack API corresponds to a specific NeMo microservice. The core microservices (Customizer, Evaluator, Guardrails) are exposed by the same endpoint. The platform components (Data Store) are each exposed by separate endpoints.
### Inference: NVIDIA NIM
NVIDIA NIM is used for running inference with registered models. There are two ways to access NVIDIA NIMs:
1. Hosted (default): Preview APIs hosted at https://integrate.api.nvidia.com (Requires an API key)
2. Self-hosted: NVIDIA NIMs that run on your own infrastructure.
The deployed platform includes the NIM Proxy microservice, which is the service that provides access to your NIMs (for example, to run inference on a model). Set the `NVIDIA_BASE_URL` environment variable to use your NVIDIA NIM Proxy deployment.
### Datasetio API: NeMo Data Store
The NeMo Data Store microservice serves as the default file storage solution for the NeMo microservices platform. It exposes APIs compatible with the Hugging Face Hub client (`HfApi`), so you can use that client to interact with the Data Store. The `NVIDIA_DATASETS_URL` environment variable should point to your NeMo Data Store endpoint.
See the {repopath}`NVIDIA Datasetio docs::llama_stack/providers/remote/datasetio/nvidia/README.md` for supported features and example usage.
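For instance, because the Data Store speaks the Hugging Face Hub protocol, uploading a dataset file can be sketched with the standard `huggingface_hub` client. The endpoint, token, and repository names below are placeholders, not values defined by this distribution:
```python
from huggingface_hub import HfApi

# Point the standard Hub client at the NeMo Data Store endpoint (placeholder URL and token).
hf_api = HfApi(endpoint="http://data-store.test/v1/hf", token="dummy-token")

hf_api.create_repo("default/sample-dataset", repo_type="dataset", exist_ok=True)
hf_api.upload_file(
    path_or_fileobj="./sample_data.jsonl",
    path_in_repo="training/sample_data.jsonl",
    repo_id="default/sample-dataset",
    repo_type="dataset",
)
```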
### Eval API: NeMo Evaluator
The NeMo Evaluator microservice supports evaluation of LLMs. Launching an Evaluation job with NeMo Evaluator requires an Evaluation Config (an object that contains metadata needed by the job). A Llama Stack Benchmark maps to an Evaluation Config, so registering a Benchmark creates an Evaluation Config in NeMo Evaluator. The `NVIDIA_EVALUATOR_URL` environment variable should point to your NeMo Microservices endpoint.
See the {repopath}`NVIDIA Eval docs::llama_stack/providers/remote/eval/nvidia/README.md` for supported features and example usage.
### Post-Training API: NeMo Customizer
The NeMo Customizer microservice supports fine-tuning models. You can reference {repopath}`this list of supported models::llama_stack/providers/remote/post_training/nvidia/models.py` that can be fine-tuned using Llama Stack. The `NVIDIA_CUSTOMIZER_URL` environment variable should point to your NeMo Microservices endpoint.
See the {repopath}`NVIDIA Post-Training docs::llama_stack/providers/remote/post_training/nvidia/README.md` for supported features and example usage.
### Safety API: NeMo Guardrails
The NeMo Guardrails microservice sits between your application and the LLM, and adds checks and content moderation to a model. The `GUARDRAILS_SERVICE_URL` environment variable should point to your NeMo Microservices endpoint.
See the {repopath}`NVIDIA Safety docs::llama_stack/providers/remote/safety/nvidia/README.md` for supported features and example usage.
## Deploying models
In order to use a registered model with the Llama Stack APIs, ensure the corresponding NIM is deployed to your environment. For example, you can use the NIM Proxy microservice to deploy `meta/llama-3.2-1b-instruct`.
Note: For improved inference speeds, we need to use NIM with the `fast_outlines` guided decoding system (specified in the request body). This is the default if you deployed the platform with the NeMo Microservices Helm Chart.
```sh
# URL to NeMo NIM Proxy service
export NEMO_URL="http://nemo.test"
curl --location "$NEMO_URL/v1/deployment/model-deployments" \
-H 'accept: application/json' \
-H 'Content-Type: application/json' \
-d '{
"name": "llama-3.2-1b-instruct",
"namespace": "meta",
"config": {
"model": "meta/llama-3.2-1b-instruct",
"nim_deployment": {
"image_name": "nvcr.io/nim/meta/llama-3.2-1b-instruct",
"image_tag": "1.8.3",
"pvc_size": "25Gi",
"gpu": 1,
"additional_envs": {
"NIM_GUIDED_DECODING_BACKEND": "fast_outlines"
}
}
}
}'
```
This NIM deployment should take approximately 10 minutes to go live. [See the docs](https://docs.nvidia.com/nemo/microservices/latest/get-started/tutorials/deploy-nims.html) for more information on how to deploy a NIM and verify it's available for inference.
You can also remove a deployed NIM to free up GPU resources, if needed.
```sh
export NEMO_URL="http://nemo.test"
curl -X DELETE "$NEMO_URL/v1/deployment/model-deployments/meta/llama-3.1-8b-instruct"
```
## Running Llama Stack with NVIDIA
You can do this via venv (building the code), or via Docker, which has a pre-built image.
### Via Docker
This method allows you to get started quickly without having to build the distribution code.
```bash
LLAMA_STACK_PORT=8321
docker run \
-it \
--pull always \
-p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
-v ./run.yaml:/root/my-run.yaml \
llamastack/distribution-nvidia \
--config /root/my-run.yaml \
--port $LLAMA_STACK_PORT \
--env NVIDIA_API_KEY=$NVIDIA_API_KEY
```
### Via venv
If you've set up your local development environment, you can also build the image using your local virtual environment.
```bash
INFERENCE_MODEL=meta-llama/Llama-3.1-8B-Instruct
llama stack build --distro nvidia --image-type venv
llama stack run ./run.yaml \
--port 8321 \
--env NVIDIA_API_KEY=$NVIDIA_API_KEY \
--env INFERENCE_MODEL=$INFERENCE_MODEL
```
## Example Notebooks
For examples of how to use the NVIDIA Distribution to run inference, fine-tune, evaluate, and run safety checks on your LLMs, you can reference the example notebooks in {repopath}`docs/notebooks/nvidia`.


@ -10,11 +10,11 @@ import sys
from pathlib import Path from pathlib import Path
from typing import Any from typing import Any
from llama_stack.core.distribution import get_provider_registry
from pydantic_core import PydanticUndefined from pydantic_core import PydanticUndefined
from rich.progress import Progress, SpinnerColumn, TextColumn from rich.progress import Progress, SpinnerColumn, TextColumn
from llama_stack.core.distribution import get_provider_registry
REPO_ROOT = Path(__file__).parent.parent REPO_ROOT = Path(__file__).parent.parent
@ -22,9 +22,7 @@ def get_api_docstring(api_name: str) -> str | None:
"""Extract docstring from the API protocol class.""" """Extract docstring from the API protocol class."""
try: try:
# Import the API module dynamically # Import the API module dynamically
api_module = __import__( api_module = __import__(f"llama_stack.apis.{api_name}", fromlist=[api_name.title()])
f"llama_stack.apis.{api_name}", fromlist=[api_name.title()]
)
# Get the main protocol class (usually capitalized API name) # Get the main protocol class (usually capitalized API name)
protocol_class_name = api_name.title() protocol_class_name = api_name.title()
@ -72,10 +70,7 @@ def get_config_class_info(config_class_path: str) -> dict[str, Any]:
model_config = config_class.model_config model_config = config_class.model_config
if hasattr(model_config, "extra") and model_config.extra == "allow": if hasattr(model_config, "extra") and model_config.extra == "allow":
accepts_extra_config = True accepts_extra_config = True
elif ( elif isinstance(model_config, dict) and model_config.get("extra") == "allow":
isinstance(model_config, dict)
and model_config.get("extra") == "allow"
):
accepts_extra_config = True accepts_extra_config = True
fields_info = {} fields_info = {}
@ -84,19 +79,9 @@ def get_config_class_info(config_class_path: str) -> dict[str, Any]:
field_type = str(field.annotation) if field.annotation else "Any" field_type = str(field.annotation) if field.annotation else "Any"
# this string replace is ridiculous # this string replace is ridiculous
field_type = ( field_type = field_type.replace("typing.", "").replace("Optional[", "").replace("]", "")
field_type.replace("typing.", "") field_type = field_type.replace("Annotated[", "").replace("FieldInfo(", "").replace(")", "")
.replace("Optional[", "") field_type = field_type.replace("llama_stack.apis.inference.inference.", "")
.replace("]", "")
)
field_type = (
field_type.replace("Annotated[", "")
.replace("FieldInfo(", "")
.replace(")", "")
)
field_type = field_type.replace(
"llama_stack.apis.inference.inference.", ""
)
field_type = field_type.replace("llama_stack.providers.", "") field_type = field_type.replace("llama_stack.providers.", "")
default_value = field.default default_value = field.default
@ -106,10 +91,7 @@ def get_config_class_info(config_class_path: str) -> dict[str, Any]:
# HACK ALERT: # HACK ALERT:
# If the default value contains a path that looks like it came from RUNTIME_BASE_DIR, # If the default value contains a path that looks like it came from RUNTIME_BASE_DIR,
# replace it with a generic ~/.llama/ path for documentation # replace it with a generic ~/.llama/ path for documentation
if ( if isinstance(default_value, str) and "/.llama/" in default_value:
isinstance(default_value, str)
and "/.llama/" in default_value
):
if ".llama/" in default_value: if ".llama/" in default_value:
path_part = default_value.split(".llama/")[-1] path_part = default_value.split(".llama/")[-1]
default_value = f"~/.llama/{path_part}" default_value = f"~/.llama/{path_part}"
@ -135,11 +117,7 @@ def get_config_class_info(config_class_path: str) -> dict[str, Any]:
lines = source.split("\n") lines = source.split("\n")
for i, line in enumerate(lines): for i, line in enumerate(lines):
if ( if "model_config" in line and "ConfigDict" in line and 'extra="allow"' in line:
"model_config" in line
and "ConfigDict" in line
and 'extra="allow"' in line
):
comments = [] comments = []
for j in range(i - 1, -1, -1): for j in range(i - 1, -1, -1):
stripped = lines[j].strip() stripped = lines[j].strip()
@ -204,9 +182,7 @@ def generate_provider_docs(progress, provider_spec: Any, api_name: str) -> str:
# Create sidebar label (clean up provider_type for display) # Create sidebar label (clean up provider_type for display)
sidebar_label = provider_type.replace("::", " - ").replace("_", " ") sidebar_label = provider_type.replace("::", " - ").replace("_", " ")
if sidebar_label.startswith("inline - "): if sidebar_label.startswith("inline - "):
sidebar_label = sidebar_label[ sidebar_label = sidebar_label[9:].title() # Remove "inline - " prefix and title case
9:
].title() # Remove "inline - " prefix and title case
else: else:
sidebar_label = sidebar_label.title() sidebar_label = sidebar_label.title()
@ -219,7 +195,8 @@ def generate_provider_docs(progress, provider_spec: Any, api_name: str) -> str:
if "\n" in description.strip(): if "\n" in description.strip():
md_lines.append("description: |") md_lines.append("description: |")
for line in description.strip().split("\n"): for line in description.strip().split("\n"):
md_lines.append(f" {line}") # Avoid trailing whitespace by only adding spaces to non-empty lines
md_lines.append(f" {line}" if line.strip() else "")
else: else:
# For single line descriptions, format properly for YAML # For single line descriptions, format properly for YAML
clean_desc = description.strip().replace('"', '\\"') clean_desc = description.strip().replace('"', '\\"')
@ -248,14 +225,10 @@ def generate_provider_docs(progress, provider_spec: Any, api_name: str) -> str:
for field_name, field_info in config_info["fields"].items(): for field_name, field_info in config_info["fields"].items():
field_type = field_info["type"].replace("|", "\\|") field_type = field_info["type"].replace("|", "\\|")
required = "Yes" if field_info["required"] else "No" required = "Yes" if field_info["required"] else "No"
default = ( default = str(field_info["default"]) if field_info["default"] is not None else ""
str(field_info["default"]) if field_info["default"] is not None else ""
)
description_text = field_info["description"] or "" description_text = field_info["description"] or ""
md_lines.append( md_lines.append(f"| `{field_name}` | `{field_type}` | {required} | {default} | {description_text} |")
f"| `{field_name}` | `{field_type}` | {required} | {default} | {description_text} |"
)
md_lines.append("") md_lines.append("")
@ -297,22 +270,16 @@ def generate_provider_docs(progress, provider_spec: Any, api_name: str) -> str:
return obj return obj
sample_config_dict = convert_pydantic_to_dict(sample_config) sample_config_dict = convert_pydantic_to_dict(sample_config)
md_lines.append( # Strip trailing newlines from yaml.dump to prevent extra blank lines
yaml.dump( yaml_output = yaml.dump(sample_config_dict, default_flow_style=False, sort_keys=False).rstrip()
sample_config_dict, default_flow_style=False, sort_keys=False md_lines.append(yaml_output)
)
)
else: else:
md_lines.append("# No sample configuration available.") md_lines.append("# No sample configuration available.")
except Exception as e: except Exception as e:
md_lines.append(f"# Error generating sample config: {str(e)}") md_lines.append(f"# Error generating sample config: {str(e)}")
md_lines.append("```") md_lines.append("```")
md_lines.append("")
if ( if hasattr(provider_spec, "deprecation_warning") and provider_spec.deprecation_warning:
hasattr(provider_spec, "deprecation_warning")
and provider_spec.deprecation_warning
):
md_lines.append("## Deprecation Notice") md_lines.append("## Deprecation Notice")
md_lines.append("") md_lines.append("")
md_lines.append(":::warning") md_lines.append(":::warning")
@ -330,9 +297,7 @@ def generate_provider_docs(progress, provider_spec: Any, api_name: str) -> str:
return "\n".join(md_lines) + "\n" return "\n".join(md_lines) + "\n"
def generate_index_docs( def generate_index_docs(api_name: str, api_docstring: str | None, provider_entries: list) -> str:
api_name: str, api_docstring: str | None, provider_entries: list
) -> str:
"""Generate MDX documentation for the index file.""" """Generate MDX documentation for the index file."""
# Create sidebar label for the API # Create sidebar label for the API
sidebar_label = api_name.replace("_", " ").title() sidebar_label = api_name.replace("_", " ").title()
@ -360,9 +325,7 @@ def generate_index_docs(
md_lines.append(f"{cleaned_docstring}") md_lines.append(f"{cleaned_docstring}")
md_lines.append("") md_lines.append("")
md_lines.append( md_lines.append(f"This section contains documentation for all available providers for the **{api_name}** API.")
f"This section contains documentation for all available providers for the **{api_name}** API."
)
md_lines.append("") md_lines.append("")
md_lines.append("## Providers") md_lines.append("## Providers")
@ -373,9 +336,8 @@ def generate_index_docs(
provider_name = entry["display_name"] provider_name = entry["display_name"]
filename = entry["filename"] filename = entry["filename"]
md_lines.append(f"- [{provider_name}](./{filename})") md_lines.append(f"- [{provider_name}](./{filename})")
md_lines.append("")
return "\n".join(md_lines) return "\n".join(md_lines) + "\n"
def process_provider_registry(progress, change_tracker: ChangedPathTracker) -> None: def process_provider_registry(progress, change_tracker: ChangedPathTracker) -> None:
@ -411,14 +373,10 @@ def process_provider_registry(progress, change_tracker: ChangedPathTracker) -> N
else: else:
display_name = display_name.title() display_name = display_name.title()
provider_entries.append( provider_entries.append({"filename": filename, "display_name": display_name})
{"filename": filename, "display_name": display_name}
)
# Generate index file with frontmatter # Generate index file with frontmatter
index_content = generate_index_docs( index_content = generate_index_docs(api_name, api_docstring, provider_entries)
api_name, api_docstring, provider_entries
)
index_file = doc_output_dir / "index.mdx" index_file = doc_output_dir / "index.mdx"
index_file.write_text(index_content) index_file.write_text(index_content)
change_tracker.add_paths(index_file) change_tracker.add_paths(index_file)