Mirror of https://github.com/meta-llama/llama-stack.git (synced 2025-10-04 04:04:14 +00:00)

provider codegen fixes

parent 29d84570c3
commit 04bf9e6f80
80 changed files with 1875 additions and 433 deletions
@@ -1,7 +1,13 @@
 ---
-description: Available providers for the agents API
-sidebar_label: Overview
-sidebar_position: 1
+description: "Agents API for creating and interacting with agentic systems.
+
+Main functionalities provided by this API:
+- Create agents with specific instructions and ability to use tools.
+- Interactions with agents are grouped into sessions (\"threads\"), and each interaction is called a \"turn\".
+- Agents can be provided with various tools (see the ToolGroups and ToolRuntime APIs for more details).
+- Agents can be provided with various shields (see the Safety API for more details).
+- Agents can also use Memory to retrieve information from knowledge bases. See the RAG Tool and Vector IO APIs for more details."
+sidebar_label: Agents
 title: Agents
 ---

@@ -22,4 +28,4 @@ This section contains documentation for all available providers for the **agents

 ## Providers

-- **[Meta Reference](./inline_meta-reference)** - Inline provider
+- [Meta-Reference](./inline_meta-reference)

@@ -1,8 +1,6 @@
 ---
-description: Meta's reference implementation of an agent system that can use tools,
-access vector databases, and perform complex reasoning tasks
-sidebar_label: Meta Reference
-sidebar_position: 2
+description: "Meta's reference implementation of an agent system that can use tools, access vector databases, and perform complex reasoning tasks."
+sidebar_label: Meta-Reference
 title: inline::meta-reference
 ---
@@ -1,7 +1,15 @@
 ---
-description: Available providers for the batches API
-sidebar_label: Overview
-sidebar_position: 1
+description: "The Batches API enables efficient processing of multiple requests in a single operation,
+particularly useful for processing large datasets, batch evaluation workflows, and
+cost-effective inference at scale.
+
+The API is designed to allow use of openai client libraries for seamless integration.
+
+This API provides the following extensions:
+- idempotent batch creation
+
+Note: This API is currently under active development and may undergo changes."
+sidebar_label: Batches
 title: Batches
 ---

@@ -24,4 +32,4 @@ This section contains documentation for all available providers for the **batche

 ## Providers

-- **[Reference](./inline_reference)** - Inline provider
+- [Reference](./inline_reference)

@@ -1,7 +1,6 @@
 ---
-description: Reference implementation of batches API with KVStore persistence
+description: "Reference implementation of batches API with KVStore persistence."
 sidebar_label: Reference
-sidebar_position: 2
 title: inline::reference
 ---
@@ -1,7 +1,5 @@
 ---
-description: Available providers for the datasetio API
-sidebar_label: Overview
-sidebar_position: 1
+sidebar_label: Datasetio
 title: Datasetio
 ---

@@ -13,6 +11,6 @@ This section contains documentation for all available providers for the **datase

 ## Providers

-- **[Localfs](./inline_localfs)** - Inline provider
-- **[Huggingface](./remote_huggingface)** - Remote provider
-- **[Nvidia](./remote_nvidia)** - Remote provider
+- [Localfs](./inline_localfs)
+- [Remote - Huggingface](./remote_huggingface)
+- [Remote - Nvidia](./remote_nvidia)

@@ -1,8 +1,6 @@
 ---
-description: Local filesystem-based dataset I/O provider for reading and writing datasets
-to local storage
+description: "Local filesystem-based dataset I/O provider for reading and writing datasets to local storage."
 sidebar_label: Localfs
-sidebar_position: 2
 title: inline::localfs
 ---

@@ -1,8 +1,6 @@
 ---
-description: HuggingFace datasets provider for accessing and managing datasets from
-the HuggingFace Hub
-sidebar_label: Huggingface
-sidebar_position: 3
+description: "HuggingFace datasets provider for accessing and managing datasets from the HuggingFace Hub."
+sidebar_label: Remote - Huggingface
 title: remote::huggingface
 ---

@@ -1,8 +1,6 @@
 ---
-description: NVIDIA's dataset I/O provider for accessing datasets from NVIDIA's data
-platform
-sidebar_label: Nvidia
-sidebar_position: 4
+description: "NVIDIA's dataset I/O provider for accessing datasets from NVIDIA's data platform."
+sidebar_label: Remote - Nvidia
 title: remote::nvidia
 ---
@@ -1,7 +1,6 @@
 ---
-description: Available providers for the eval API
-sidebar_label: Overview
-sidebar_position: 1
+description: "Llama Stack Evaluation API for running evaluations on model and agent candidates."
+sidebar_label: Eval
 title: Eval
 ---

@@ -15,5 +14,5 @@ This section contains documentation for all available providers for the **eval**

 ## Providers

-- **[Meta Reference](./inline_meta-reference)** - Inline provider
-- **[Nvidia](./remote_nvidia)** - Remote provider
+- [Meta-Reference](./inline_meta-reference)
+- [Remote - Nvidia](./remote_nvidia)

@@ -1,8 +1,6 @@
 ---
-description: Meta's reference implementation of evaluation tasks with support for
-multiple languages and evaluation metrics
-sidebar_label: Meta Reference
-sidebar_position: 2
+description: "Meta's reference implementation of evaluation tasks with support for multiple languages and evaluation metrics."
+sidebar_label: Meta-Reference
 title: inline::meta-reference
 ---

@@ -1,8 +1,6 @@
 ---
-description: NVIDIA's evaluation provider for running evaluation tasks on NVIDIA's
-platform
-sidebar_label: Nvidia
-sidebar_position: 3
+description: "NVIDIA's evaluation provider for running evaluation tasks on NVIDIA's platform."
+sidebar_label: Remote - Nvidia
 title: remote::nvidia
 ---
@@ -1,7 +1,5 @@
 ---
-description: Available providers for the files API
-sidebar_label: Overview
-sidebar_position: 1
+sidebar_label: Files
 title: Files
 ---

@@ -13,5 +11,5 @@ This section contains documentation for all available providers for the **files*

 ## Providers

-- **[Localfs](./inline_localfs)** - Inline provider
-- **[S3](./remote_s3)** - Remote provider
+- [Localfs](./inline_localfs)
+- [Remote - S3](./remote_s3)

@@ -1,8 +1,6 @@
 ---
-description: Local filesystem-based file storage provider for managing files and documents
-locally
+description: "Local filesystem-based file storage provider for managing files and documents locally."
 sidebar_label: Localfs
-sidebar_position: 2
 title: inline::localfs
 ---

@@ -1,8 +1,6 @@
 ---
-description: AWS S3-based file storage provider for scalable cloud file management
-with metadata persistence
-sidebar_label: S3
-sidebar_position: 3
+description: "AWS S3-based file storage provider for scalable cloud file management with metadata persistence."
+sidebar_label: Remote - S3
 title: remote::s3
 ---
@@ -1,7 +1,10 @@
 ---
-description: Available providers for the inference API
-sidebar_label: Overview
-sidebar_position: 1
+description: "Llama Stack Inference API for generating completions, chat completions, and embeddings.
+
+This API provides the raw interface to the underlying models. Two kinds of models are supported:
+- LLM models: these models generate \"raw\" and \"chat\" (conversational) completions.
+- Embedding models: these models generate embeddings to be used for semantic search."
+sidebar_label: Inference
 title: Inference
 ---

@@ -19,28 +22,27 @@ This section contains documentation for all available providers for the **infere

 ## Providers

-- **[Meta Reference](./inline_meta-reference)** - Inline provider
-- **[Sentence Transformers](./inline_sentence-transformers)** - Inline provider
-- **[Anthropic](./remote_anthropic)** - Remote provider
-- **[Azure](./remote_azure)** - Remote provider
-- **[Bedrock](./remote_bedrock)** - Remote provider
-- **[Cerebras](./remote_cerebras)** - Remote provider
-- **[Databricks](./remote_databricks)** - Remote provider
-- **[Fireworks](./remote_fireworks)** - Remote provider
-- **[Gemini](./remote_gemini)** - Remote provider
-- **[Groq](./remote_groq)** - Remote provider
-- **[Hugging Face Endpoint](./remote_hf_endpoint)** - Remote provider
-- **[Hugging Face Serverless](./remote_hf_serverless)** - Remote provider
-- **[Llama OpenAI Compatible](./remote_llama-openai-compat)** - Remote provider
-- **[Nvidia](./remote_nvidia)** - Remote provider
-- **[Ollama](./remote_ollama)** - Remote provider
-- **[Openai](./remote_openai)** - Remote provider
-- **[Passthrough](./remote_passthrough)** - Remote provider
-- **[Runpod](./remote_runpod)** - Remote provider
-- **[Sambanova](./remote_sambanova)** - Remote provider
-- **[SambaNova OpenAI Compatible](./remote_sambanova-openai-compat)** - Remote provider
-- **[Tgi](./remote_tgi)** - Remote provider
-- **[Together](./remote_together)** - Remote provider
-- **[Vertexai](./remote_vertexai)** - Remote provider
-- **[Vllm](./remote_vllm)** - Remote provider
-- **[Watsonx](./remote_watsonx)** - Remote provider
+- [Meta-Reference](./inline_meta-reference)
+- [Sentence-Transformers](./inline_sentence-transformers)
+- [Remote - Anthropic](./remote_anthropic)
+- [Remote - Azure](./remote_azure)
+- [Remote - Bedrock](./remote_bedrock)
+- [Remote - Cerebras](./remote_cerebras)
+- [Remote - Databricks](./remote_databricks)
+- [Remote - Fireworks](./remote_fireworks)
+- [Remote - Gemini](./remote_gemini)
+- [Remote - Groq](./remote_groq)
+- [Remote - Hf - Endpoint](./remote_hf_endpoint)
+- [Remote - Hf - Serverless](./remote_hf_serverless)
+- [Remote - Llama-Openai-Compat](./remote_llama-openai-compat)
+- [Remote - Nvidia](./remote_nvidia)
+- [Remote - Ollama](./remote_ollama)
+- [Remote - Openai](./remote_openai)
+- [Remote - Passthrough](./remote_passthrough)
+- [Remote - Runpod](./remote_runpod)
+- [Remote - Sambanova](./remote_sambanova)
+- [Remote - Tgi](./remote_tgi)
+- [Remote - Together](./remote_together)
+- [Remote - Vertexai](./remote_vertexai)
+- [Remote - Vllm](./remote_vllm)
+- [Remote - Watsonx](./remote_watsonx)
@@ -1,8 +1,6 @@
 ---
-description: Meta's reference implementation of inference with support for various
-model formats and optimization techniques
-sidebar_label: Meta Reference
-sidebar_position: 2
+description: "Meta's reference implementation of inference with support for various model formats and optimization techniques."
+sidebar_label: Meta-Reference
 title: inline::meta-reference
 ---

@@ -1,8 +1,6 @@
 ---
-description: Sentence Transformers inference provider for text embeddings and similarity
-search
-sidebar_label: Sentence Transformers
-sidebar_position: 3
+description: "Sentence Transformers inference provider for text embeddings and similarity search."
+sidebar_label: Sentence-Transformers
 title: inline::sentence-transformers
 ---

@@ -12,10 +10,6 @@ title: inline::sentence-transformers

 Sentence Transformers inference provider for text embeddings and similarity search.

-## Configuration
-
-No configuration options available.
-
 ## Sample Configuration

 ```yaml
@@ -1,8 +1,6 @@
 ---
-description: Anthropic inference provider for accessing Claude models and Anthropic's
-AI services
-sidebar_label: Anthropic
-sidebar_position: 4
+description: "Anthropic inference provider for accessing Claude models and Anthropic's AI services."
+sidebar_label: Remote - Anthropic
 title: remote::anthropic
 ---

@@ -1,8 +1,9 @@
 ---
-description: Azure OpenAI inference provider for accessing GPT models and other Azure
-services
-sidebar_label: Azure
-sidebar_position: 5
+description: |
+Azure OpenAI inference provider for accessing GPT models and other Azure services.
+Provider documentation
+https://learn.microsoft.com/en-us/azure/ai-foundry/openai/overview
+sidebar_label: Remote - Azure
 title: remote::azure
 ---

@@ -10,10 +11,12 @@ title: remote::azure

 ## Description

+
 Azure OpenAI inference provider for accessing GPT models and other Azure services.
 Provider documentation
 https://learn.microsoft.com/en-us/azure/ai-foundry/openai/overview

+
 ## Configuration

 | Field | Type | Required | Default | Description |
@@ -1,8 +1,6 @@
 ---
-description: AWS Bedrock inference provider for accessing various AI models through
-AWS's managed service
-sidebar_label: Bedrock
-sidebar_position: 6
+description: "AWS Bedrock inference provider for accessing various AI models through AWS's managed service."
+sidebar_label: Remote - Bedrock
 title: remote::bedrock
 ---

@@ -1,7 +1,6 @@
 ---
-description: Cerebras inference provider for running models on Cerebras Cloud platform
-sidebar_label: Cerebras
-sidebar_position: 7
+description: "Cerebras inference provider for running models on Cerebras Cloud platform."
+sidebar_label: Remote - Cerebras
 title: remote::cerebras
 ---

@@ -1,8 +1,6 @@
 ---
-description: Databricks inference provider for running models on Databricks' unified
-analytics platform
-sidebar_label: Databricks
-sidebar_position: 8
+description: "Databricks inference provider for running models on Databricks' unified analytics platform."
+sidebar_label: Remote - Databricks
 title: remote::databricks
 ---

@@ -1,8 +1,6 @@
 ---
-description: Fireworks AI inference provider for Llama models and other AI models
-on the Fireworks platform
-sidebar_label: Fireworks
-sidebar_position: 9
+description: "Fireworks AI inference provider for Llama models and other AI models on the Fireworks platform."
+sidebar_label: Remote - Fireworks
 title: remote::fireworks
 ---

@@ -1,8 +1,6 @@
 ---
-description: Google Gemini inference provider for accessing Gemini models and Google's
-AI services
-sidebar_label: Gemini
-sidebar_position: 10
+description: "Google Gemini inference provider for accessing Gemini models and Google's AI services."
+sidebar_label: Remote - Gemini
 title: remote::gemini
 ---

@@ -1,7 +1,6 @@
 ---
-description: Groq inference provider for ultra-fast inference using Groq's LPU technology
-sidebar_label: Groq
-sidebar_position: 11
+description: "Groq inference provider for ultra-fast inference using Groq's LPU technology."
+sidebar_label: Remote - Groq
 title: remote::groq
 ---
@@ -1,7 +1,6 @@
 ---
-description: HuggingFace Inference Endpoints provider for dedicated model serving
-sidebar_label: Hugging Face Endpoint
-sidebar_position: 12
+description: "HuggingFace Inference Endpoints provider for dedicated model serving."
+sidebar_label: Remote - Hf - Endpoint
 title: remote::hf::endpoint
 ---

@@ -15,8 +14,8 @@ HuggingFace Inference Endpoints provider for dedicated model serving.

 | Field | Type | Required | Default | Description |
 |-------|------|----------|---------|-------------|
-| `endpoint_name` | `<class 'str'>` | No | | The name of the Hugging Face Inference Endpoint in the format of `{namespace}/{endpoint_name}` (e.g. 'my-cool-org/meta-llama-3-1-8b-instruct-rce'). Namespace is optional and will default to the user account if not provided. |
-| `api_token` | `pydantic.types.SecretStr or None` | No | | Your Hugging Face user access token (will default to locally saved token if not provided) |
+| `endpoint_name` | `<class 'str'>` | No | | The name of the Hugging Face Inference Endpoint in the format of '{namespace}/{endpoint_name}' (e.g. 'my-cool-org/meta-llama-3-1-8b-instruct-rce'). Namespace is optional and will default to the user account if not provided. |
+| `api_token` | `pydantic.types.SecretStr \| None` | No | | Your Hugging Face user access token (will default to locally saved token if not provided) |

 ## Sample Configuration
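For orientation while reading this hunk, here is a minimal run-config sketch for `remote::hf::endpoint` built only from the two fields in the table above; the surrounding `providers:` layout and the `HF_API_TOKEN` environment variable are illustrative assumptions, not part of this diff.

```yaml
# Hypothetical fragment of a Llama Stack run config (layout assumed, fields from the table above)
providers:
  inference:
  - provider_id: hf-endpoint            # arbitrary identifier
    provider_type: remote::hf::endpoint
    config:
      endpoint_name: my-cool-org/meta-llama-3-1-8b-instruct-rce  # '{namespace}/{endpoint_name}'
      api_token: ${env.HF_API_TOKEN}    # falls back to the locally saved token if omitted
```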
@@ -1,7 +1,6 @@
 ---
-description: HuggingFace Inference API serverless provider for on-demand model inference
-sidebar_label: Hugging Face Serverless
-sidebar_position: 13
+description: "HuggingFace Inference API serverless provider for on-demand model inference."
+sidebar_label: Remote - Hf - Serverless
 title: remote::hf::serverless
 ---

@@ -1,8 +1,6 @@
 ---
-description: Llama OpenAI-compatible provider for using Llama models with OpenAI API
-format
-sidebar_label: Llama OpenAI Compatible
-sidebar_position: 14
+description: "Llama OpenAI-compatible provider for using Llama models with OpenAI API format."
+sidebar_label: Remote - Llama-Openai-Compat
 title: remote::llama-openai-compat
 ---

@@ -1,7 +1,6 @@
 ---
-description: NVIDIA inference provider for accessing NVIDIA NIM models and AI services
-sidebar_label: Nvidia
-sidebar_position: 15
+description: "NVIDIA inference provider for accessing NVIDIA NIM models and AI services."
+sidebar_label: Remote - Nvidia
 title: remote::nvidia
 ---

@@ -1,8 +1,6 @@
 ---
-description: Ollama inference provider for running local models through the Ollama
-runtime
-sidebar_label: Ollama
-sidebar_position: 16
+description: "Ollama inference provider for running local models through the Ollama runtime."
+sidebar_label: Remote - Ollama
 title: remote::ollama
 ---

@@ -1,7 +1,6 @@
 ---
-description: OpenAI inference provider for accessing GPT models and other OpenAI services
-sidebar_label: Openai
-sidebar_position: 17
+description: "OpenAI inference provider for accessing GPT models and other OpenAI services."
+sidebar_label: Remote - Openai
 title: remote::openai
 ---

@@ -1,8 +1,6 @@
 ---
-description: Passthrough inference provider for connecting to any external inference
-service not directly supported
-sidebar_label: Passthrough
-sidebar_position: 18
+description: "Passthrough inference provider for connecting to any external inference service not directly supported."
+sidebar_label: Remote - Passthrough
 title: remote::passthrough
 ---

@@ -1,7 +1,6 @@
 ---
-description: RunPod inference provider for running models on RunPod's cloud GPU platform
-sidebar_label: Runpod
-sidebar_position: 19
+description: "RunPod inference provider for running models on RunPod's cloud GPU platform."
+sidebar_label: Remote - Runpod
 title: remote::runpod
 ---

@@ -1,8 +1,6 @@
 ---
-description: SambaNova inference provider for running models on SambaNova's dataflow
-architecture
-sidebar_label: Sambanova
-sidebar_position: 20
+description: "SambaNova inference provider for running models on SambaNova's dataflow architecture."
+sidebar_label: Remote - Sambanova
 title: remote::sambanova
 ---

@@ -1,7 +1,6 @@
 ---
-description: Text Generation Inference (TGI) provider for HuggingFace model serving
-sidebar_label: Tgi
-sidebar_position: 22
+description: "Text Generation Inference (TGI) provider for HuggingFace model serving."
+sidebar_label: Remote - Tgi
 title: remote::tgi
 ---

@@ -1,8 +1,6 @@
 ---
-description: Together AI inference provider for open-source models and collaborative
-AI development
-sidebar_label: Together
-sidebar_position: 23
+description: "Together AI inference provider for open-source models and collaborative AI development."
+sidebar_label: Remote - Together
 title: remote::together
 ---
@@ -1,18 +1,26 @@
 ---
-description: "Google Vertex AI inference provider enables you to use Google's Gemini\
-\ models through Google Cloud's Vertex AI platform, providing several advantages:\n\
-\n\u2022 Enterprise-grade security: Uses Google Cloud's security controls and IAM\n\
-\u2022 Better integration: Seamless integration with other Google Cloud services\n\
-\u2022 Advanced features: Access to additional Vertex AI features like model tuning\
-\ and monitoring\n\u2022 Authentication: Uses Google Cloud Application Default Credentials\
-\ (ADC) instead of API keys\n\nConfiguration:\n- Set VERTEX_AI_PROJECT environment\
-\ variable (required)\n- Set VERTEX_AI_LOCATION environment variable (optional,\
-\ defaults to us-central1)\n- Use Google Cloud Application Default Credentials or\
-\ service account key\n\nAuthentication Setup:\nOption 1 (Recommended): gcloud auth\
-\ application-default login\nOption 2: Set GOOGLE_APPLICATION_CREDENTIALS to service\
-\ account key path\n\nAvailable Models:\n- vertex_ai/gemini-2"
-sidebar_label: Vertexai
-sidebar_position: 24
+description: |
+Google Vertex AI inference provider enables you to use Google's Gemini models through Google Cloud's Vertex AI platform, providing several advantages:
+
+• Enterprise-grade security: Uses Google Cloud's security controls and IAM
+• Better integration: Seamless integration with other Google Cloud services
+• Advanced features: Access to additional Vertex AI features like model tuning and monitoring
+• Authentication: Uses Google Cloud Application Default Credentials (ADC) instead of API keys
+
+Configuration:
+- Set VERTEX_AI_PROJECT environment variable (required)
+- Set VERTEX_AI_LOCATION environment variable (optional, defaults to us-central1)
+- Use Google Cloud Application Default Credentials or service account key
+
+Authentication Setup:
+Option 1 (Recommended): gcloud auth application-default login
+Option 2: Set GOOGLE_APPLICATION_CREDENTIALS to service account key path
+
+Available Models:
+- vertex_ai/gemini-2.0-flash
+- vertex_ai/gemini-2.5-flash
+- vertex_ai/gemini-2.5-pro
+sidebar_label: Remote - Vertexai
 title: remote::vertexai
 ---
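As a hedged sketch of the setup the new description walks through: the environment variables and the gcloud command are the ones named above, while the `project`/`location` field names and the `providers:` layout are assumptions for illustration only.

```yaml
# Hypothetical remote::vertexai entry in a Llama Stack run config (field names assumed)
providers:
  inference:
  - provider_id: vertexai
    provider_type: remote::vertexai
    config:
      project: ${env.VERTEX_AI_PROJECT}                 # required, per the description above
      location: ${env.VERTEX_AI_LOCATION:=us-central1}  # optional, defaults to us-central1
# Authentication uses Application Default Credentials, e.g. `gcloud auth application-default login`,
# or GOOGLE_APPLICATION_CREDENTIALS pointing at a service account key, as described above.
```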
@@ -1,7 +1,6 @@
 ---
-description: Remote vLLM inference provider for connecting to vLLM servers
-sidebar_label: Vllm
-sidebar_position: 25
+description: "Remote vLLM inference provider for connecting to vLLM servers."
+sidebar_label: Remote - Vllm
 title: remote::vllm
 ---

@@ -1,8 +1,6 @@
 ---
-description: IBM WatsonX inference provider for accessing AI models on IBM's WatsonX
-platform
-sidebar_label: Watsonx
-sidebar_position: 26
+description: "IBM WatsonX inference provider for accessing AI models on IBM's WatsonX platform."
+sidebar_label: Remote - Watsonx
 title: remote::watsonx
 ---

@@ -17,8 +15,8 @@ IBM WatsonX inference provider for accessing AI models on IBM's WatsonX platform
 | Field | Type | Required | Default | Description |
 |-------|------|----------|---------|-------------|
 | `url` | `<class 'str'>` | No | https://us-south.ml.cloud.ibm.com | A base url for accessing the watsonx.ai |
-| `api_key` | `pydantic.types.SecretStr \| None` | No | | The watsonx API key, only needed of using the hosted service |
-| `project_id` | `str \| None` | No | | The Project ID key, only needed of using the hosted service |
+| `api_key` | `pydantic.types.SecretStr \| None` | No | | The watsonx API key |
+| `project_id` | `str \| None` | No | | The Project ID key |
 | `timeout` | `<class 'int'>` | No | 60 | Timeout for the HTTP requests |

 ## Sample Configuration
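A minimal sketch of a `remote::watsonx` config using the field names and defaults from the table above; the `providers:` wrapper and the environment-variable names are illustrative assumptions.

```yaml
# Hypothetical remote::watsonx entry in a Llama Stack run config
providers:
  inference:
  - provider_id: watsonx
    provider_type: remote::watsonx
    config:
      url: https://us-south.ml.cloud.ibm.com   # default from the table above
      api_key: ${env.WATSONX_API_KEY}          # env var name assumed
      project_id: ${env.WATSONX_PROJECT_ID}    # env var name assumed
      timeout: 60
```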
@@ -1,7 +1,5 @@
 ---
-description: Available providers for the post_training API
-sidebar_label: Overview
-sidebar_position: 1
+sidebar_label: Post Training
 title: Post_Training
 ---

@@ -13,10 +11,7 @@ This section contains documentation for all available providers for the **post_t

 ## Providers

-- **[Huggingface](./inline_huggingface)** - Inline provider
-- **[Huggingface Cpu](./inline_huggingface-cpu)** - Inline provider
-- **[Huggingface Gpu](./inline_huggingface-gpu)** - Inline provider
-- **[Torchtune](./inline_torchtune)** - Inline provider
-- **[Torchtune Cpu](./inline_torchtune-cpu)** - Inline provider
-- **[Torchtune Gpu](./inline_torchtune-gpu)** - Inline provider
-- **[Nvidia](./remote_nvidia)** - Remote provider
+- [Huggingface-Gpu](./inline_huggingface-gpu)
+- [Torchtune-Cpu](./inline_torchtune-cpu)
+- [Torchtune-Gpu](./inline_torchtune-gpu)
+- [Remote - Nvidia](./remote_nvidia)

@@ -1,8 +1,6 @@
 ---
-description: HuggingFace-based post-training provider for fine-tuning models using
-the HuggingFace ecosystem
-sidebar_label: Huggingface Gpu
-sidebar_position: 4
+description: "HuggingFace-based post-training provider for fine-tuning models using the HuggingFace ecosystem."
+sidebar_label: Huggingface-Gpu
 title: inline::huggingface-gpu
 ---

@@ -17,10 +15,13 @@ HuggingFace-based post-training provider for fine-tuning models using the Huggin
 | Field | Type | Required | Default | Description |
 |-------|------|----------|---------|-------------|
 | `device` | `<class 'str'>` | No | cuda | |
-| `distributed_backend` | `Literal['fsdp', 'deepspeed']` | No | | |
-| `checkpoint_format` | `Literal['full_state', 'huggingface']` | No | huggingface | |
-| `chat_template` | `<class 'str'>` | No | `<\|user\|>{input}<\|assistant\|>{output}` | |
-| `model_specific_config` | `<class 'dict'>` | No | `{'trust_remote_code': True, 'attn_implementation': 'sdpa'}` | |
+| `distributed_backend` | `Literal['fsdp', 'deepspeed'` | No | | |
+| `checkpoint_format` | `Literal['full_state', 'huggingface'` | No | huggingface | |
+| `chat_template` | `<class 'str'>` | No | <|user|>
+{input}
+<|assistant|>
+{output} | |
+| `model_specific_config` | `<class 'dict'>` | No | {'trust_remote_code': True, 'attn_implementation': 'sdpa'} | |
 | `max_seq_length` | `<class 'int'>` | No | 2048 | |
 | `gradient_checkpointing` | `<class 'bool'>` | No | False | |
 | `save_total_limit` | `<class 'int'>` | No | 3 | |
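For reference, a minimal sketch of an `inline::huggingface-gpu` post-training entry assembled from the fields and defaults in the table above; the `providers:` layout is assumed and only a subset of fields is shown.

```yaml
# Hypothetical inline::huggingface-gpu entry in a Llama Stack run config
providers:
  post_training:
  - provider_id: huggingface-gpu
    provider_type: inline::huggingface-gpu
    config:
      device: cuda
      checkpoint_format: huggingface   # or full_state
      max_seq_length: 2048
      gradient_checkpointing: false
      save_total_limit: 3
```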
@@ -1,8 +1,6 @@
 ---
-description: TorchTune-based post-training provider for fine-tuning and optimizing
-models using Meta's TorchTune framework
-sidebar_label: Torchtune Cpu
-sidebar_position: 6
+description: "TorchTune-based post-training provider for fine-tuning and optimizing models using Meta's TorchTune framework."
+sidebar_label: Torchtune-Cpu
 title: inline::torchtune-cpu
 ---

@@ -1,8 +1,6 @@
 ---
-description: TorchTune-based post-training provider for fine-tuning and optimizing
-models using Meta's TorchTune framework
-sidebar_label: Torchtune Gpu
-sidebar_position: 7
+description: "TorchTune-based post-training provider for fine-tuning and optimizing models using Meta's TorchTune framework."
+sidebar_label: Torchtune-Gpu
 title: inline::torchtune-gpu
 ---

@@ -1,7 +1,6 @@
 ---
-description: NVIDIA's post-training provider for fine-tuning models on NVIDIA's platform
-sidebar_label: Nvidia
-sidebar_position: 8
+description: "NVIDIA's post-training provider for fine-tuning models on NVIDIA's platform."
+sidebar_label: Remote - Nvidia
 title: remote::nvidia
 ---

@@ -1,7 +1,5 @@
 ---
-description: Available providers for the safety API
-sidebar_label: Overview
-sidebar_position: 1
+sidebar_label: Safety
 title: Safety
 ---

@@ -13,9 +11,9 @@ This section contains documentation for all available providers for the **safety

 ## Providers

-- **[Code Scanner](./inline_code-scanner)** - Inline provider
-- **[Llama Guard](./inline_llama-guard)** - Inline provider
-- **[Prompt Guard](./inline_prompt-guard)** - Inline provider
-- **[Bedrock](./remote_bedrock)** - Remote provider
-- **[Nvidia](./remote_nvidia)** - Remote provider
-- **[Sambanova](./remote_sambanova)** - Remote provider
+- [Code-Scanner](./inline_code-scanner)
+- [Llama-Guard](./inline_llama-guard)
+- [Prompt-Guard](./inline_prompt-guard)
+- [Remote - Bedrock](./remote_bedrock)
+- [Remote - Nvidia](./remote_nvidia)
+- [Remote - Sambanova](./remote_sambanova)

@@ -1,8 +1,6 @@
 ---
-description: Code Scanner safety provider for detecting security vulnerabilities and
-unsafe code patterns
-sidebar_label: Code Scanner
-sidebar_position: 2
+description: "Code Scanner safety provider for detecting security vulnerabilities and unsafe code patterns."
+sidebar_label: Code-Scanner
 title: inline::code-scanner
 ---

@@ -12,10 +10,6 @@ title: inline::code-scanner

 Code Scanner safety provider for detecting security vulnerabilities and unsafe code patterns.

-## Configuration
-
-No configuration options available.
-
 ## Sample Configuration

 ```yaml

@@ -1,8 +1,6 @@
 ---
-description: Llama Guard safety provider for content moderation and safety filtering
-using Meta's Llama Guard model
-sidebar_label: Llama Guard
-sidebar_position: 3
+description: "Llama Guard safety provider for content moderation and safety filtering using Meta's Llama Guard model."
+sidebar_label: Llama-Guard
 title: inline::llama-guard
 ---

@@ -1,8 +1,6 @@
 ---
-description: Prompt Guard safety provider for detecting and filtering unsafe prompts
-and content
-sidebar_label: Prompt Guard
-sidebar_position: 4
+description: "Prompt Guard safety provider for detecting and filtering unsafe prompts and content."
+sidebar_label: Prompt-Guard
 title: inline::prompt-guard
 ---

@@ -1,8 +1,6 @@
 ---
-description: AWS Bedrock safety provider for content moderation using AWS's safety
-services
-sidebar_label: Bedrock
-sidebar_position: 5
+description: "AWS Bedrock safety provider for content moderation using AWS's safety services."
+sidebar_label: Remote - Bedrock
 title: remote::bedrock
 ---

@@ -1,7 +1,6 @@
 ---
-description: NVIDIA's safety provider for content moderation and safety filtering
-sidebar_label: Nvidia
-sidebar_position: 6
+description: "NVIDIA's safety provider for content moderation and safety filtering."
+sidebar_label: Remote - Nvidia
 title: remote::nvidia
 ---

@@ -1,7 +1,6 @@
 ---
-description: SambaNova's safety provider for content moderation and safety filtering
-sidebar_label: Sambanova
-sidebar_position: 7
+description: "SambaNova's safety provider for content moderation and safety filtering."
+sidebar_label: Remote - Sambanova
 title: remote::sambanova
 ---
@@ -1,7 +1,5 @@
 ---
-description: Available providers for the scoring API
-sidebar_label: Overview
-sidebar_position: 1
+sidebar_label: Scoring
 title: Scoring
 ---

@@ -13,6 +11,6 @@ This section contains documentation for all available providers for the **scorin

 ## Providers

-- **[Basic](./inline_basic)** - Inline provider
-- **[Braintrust](./inline_braintrust)** - Inline provider
-- **[Llm As Judge](./inline_llm-as-judge)** - Inline provider
+- [Basic](./inline_basic)
+- [Braintrust](./inline_braintrust)
+- [Llm-As-Judge](./inline_llm-as-judge)

@@ -1,7 +1,6 @@
 ---
-description: Basic scoring provider for simple evaluation metrics and scoring functions
+description: "Basic scoring provider for simple evaluation metrics and scoring functions."
 sidebar_label: Basic
-sidebar_position: 2
 title: inline::basic
 ---

@@ -11,10 +10,6 @@ title: inline::basic

 Basic scoring provider for simple evaluation metrics and scoring functions.

-## Configuration
-
-No configuration options available.
-
 ## Sample Configuration

 ```yaml

@@ -1,8 +1,6 @@
 ---
-description: Braintrust scoring provider for evaluation and scoring using the Braintrust
-platform
+description: "Braintrust scoring provider for evaluation and scoring using the Braintrust platform."
 sidebar_label: Braintrust
-sidebar_position: 3
 title: inline::braintrust
 ---

@@ -1,8 +1,6 @@
 ---
-description: LLM-as-judge scoring provider that uses language models to evaluate and
-score responses
-sidebar_label: Llm As Judge
-sidebar_position: 4
+description: "LLM-as-judge scoring provider that uses language models to evaluate and score responses."
+sidebar_label: Llm-As-Judge
 title: inline::llm-as-judge
 ---

@@ -12,10 +10,6 @@ title: inline::llm-as-judge

 LLM-as-judge scoring provider that uses language models to evaluate and score responses.

-## Configuration
-
-No configuration options available.
-
 ## Sample Configuration

 ```yaml
@@ -1,7 +1,5 @@
 ---
-description: Available providers for the telemetry API
-sidebar_label: Overview
-sidebar_position: 1
+sidebar_label: Telemetry
 title: Telemetry
 ---

@@ -13,4 +11,4 @@ This section contains documentation for all available providers for the **teleme

 ## Providers

-- **[Meta Reference](./inline_meta-reference)** - Inline provider
+- [Meta-Reference](./inline_meta-reference)

@@ -1,8 +1,6 @@
 ---
-description: Meta's reference implementation of telemetry and observability using
-OpenTelemetry
-sidebar_label: Meta Reference
-sidebar_position: 2
+description: "Meta's reference implementation of telemetry and observability using OpenTelemetry."
+sidebar_label: Meta-Reference
 title: inline::meta-reference
 ---

@@ -16,9 +14,9 @@ Meta's reference implementation of telemetry and observability using OpenTelemet
 | Field | Type | Required | Default | Description |
 |-------|------|----------|---------|-------------|
-| `otel_exporter_otlp_endpoint` | `str or None` | No | | The OpenTelemetry collector endpoint URL (base URL for traces, metrics, and logs). If not set, the SDK will use OTEL_EXPORTER_OTLP_ENDPOINT environment variable. |
+| `otel_exporter_otlp_endpoint` | `str \| None` | No | | The OpenTelemetry collector endpoint URL (base URL for traces, metrics, and logs). If not set, the SDK will use OTEL_EXPORTER_OTLP_ENDPOINT environment variable. |
 | `service_name` | `<class 'str'>` | No | | The service name to use for telemetry |
-| `sinks` | `list[TelemetrySink]` | No | `[CONSOLE, SQLITE]` | List of telemetry sinks to enable (possible values: otel_trace, otel_metric, sqlite, console) |
+| `sinks` | `list[inline.telemetry.meta_reference.config.TelemetrySink` | No | [<TelemetrySink.CONSOLE: 'console'>, <TelemetrySink.SQLITE: 'sqlite'>] | List of telemetry sinks to enable (possible values: otel_trace, otel_metric, sqlite, console) |
 | `sqlite_db_path` | `<class 'str'>` | No | ~/.llama/runtime/trace_store.db | The path to the SQLite database to use for storing traces |

 ## Sample Configuration
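A minimal sketch of the telemetry provider's config built from the fields and defaults in the table above; the sink names come from the table's description column, while the `providers:` layout and the service name value are illustrative assumptions.

```yaml
# Hypothetical inline::meta-reference telemetry entry in a Llama Stack run config
providers:
  telemetry:
  - provider_id: meta-reference
    provider_type: inline::meta-reference
    config:
      service_name: llama-stack                        # name reported with traces/metrics (example value)
      sinks: [console, sqlite]                         # defaults per the table; otel_trace/otel_metric also possible
      sqlite_db_path: ~/.llama/runtime/trace_store.db  # default trace store path from the table
```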
@@ -1,7 +1,5 @@
 ---
-description: Available providers for the tool_runtime API
-sidebar_label: Overview
-sidebar_position: 1
+sidebar_label: Tool Runtime
 title: Tool_Runtime
 ---

@@ -13,9 +11,9 @@ This section contains documentation for all available providers for the **tool_r

 ## Providers

-- **[Rag Runtime](./inline_rag-runtime)** - Inline provider
-- **[Bing Search](./remote_bing-search)** - Remote provider
-- **[Brave Search](./remote_brave-search)** - Remote provider
-- **[Model Context Protocol](./remote_model-context-protocol)** - Remote provider
-- **[Tavily Search](./remote_tavily-search)** - Remote provider
-- **[Wolfram Alpha](./remote_wolfram-alpha)** - Remote provider
+- [Rag-Runtime](./inline_rag-runtime)
+- [Remote - Bing-Search](./remote_bing-search)
+- [Remote - Brave-Search](./remote_brave-search)
+- [Remote - Model-Context-Protocol](./remote_model-context-protocol)
+- [Remote - Tavily-Search](./remote_tavily-search)
+- [Remote - Wolfram-Alpha](./remote_wolfram-alpha)

@@ -1,8 +1,6 @@
 ---
-description: RAG (Retrieval-Augmented Generation) tool runtime for document ingestion,
-chunking, and semantic search
-sidebar_label: Rag Runtime
-sidebar_position: 2
+description: "RAG (Retrieval-Augmented Generation) tool runtime for document ingestion, chunking, and semantic search."
+sidebar_label: Rag-Runtime
 title: inline::rag-runtime
 ---

@@ -12,10 +10,6 @@ title: inline::rag-runtime

 RAG (Retrieval-Augmented Generation) tool runtime for document ingestion, chunking, and semantic search.

-## Configuration
-
-No configuration options available.
-
 ## Sample Configuration

 ```yaml

@@ -1,8 +1,6 @@
 ---
-description: Bing Search tool for web search capabilities using Microsoft's search
-engine
-sidebar_label: Bing Search
-sidebar_position: 3
+description: "Bing Search tool for web search capabilities using Microsoft's search engine."
+sidebar_label: Remote - Bing-Search
 title: remote::bing-search
 ---

@@ -1,7 +1,6 @@
 ---
-description: Brave Search tool for web search capabilities with privacy-focused results
-sidebar_label: Brave Search
-sidebar_position: 4
+description: "Brave Search tool for web search capabilities with privacy-focused results."
+sidebar_label: Remote - Brave-Search
 title: remote::brave-search
 ---

@@ -1,8 +1,6 @@
 ---
-description: Model Context Protocol (MCP) tool for standardized tool calling and context
-management
-sidebar_label: Model Context Protocol
-sidebar_position: 5
+description: "Model Context Protocol (MCP) tool for standardized tool calling and context management."
+sidebar_label: Remote - Model-Context-Protocol
 title: remote::model-context-protocol
 ---

@@ -12,10 +10,6 @@ title: remote::model-context-protocol

 Model Context Protocol (MCP) tool for standardized tool calling and context management.

-## Configuration
-
-No configuration options available.
-
 ## Sample Configuration

 ```yaml

@@ -1,7 +1,6 @@
 ---
-description: Tavily Search tool for AI-optimized web search with structured results
-sidebar_label: Tavily Search
-sidebar_position: 6
+description: "Tavily Search tool for AI-optimized web search with structured results."
+sidebar_label: Remote - Tavily-Search
 title: remote::tavily-search
 ---

@@ -1,7 +1,6 @@
 ---
-description: Wolfram Alpha tool for computational knowledge and mathematical calculations
-sidebar_label: Wolfram Alpha
-sidebar_position: 7
+description: "Wolfram Alpha tool for computational knowledge and mathematical calculations."
+sidebar_label: Remote - Wolfram-Alpha
 title: remote::wolfram-alpha
 ---
@@ -1,7 +1,5 @@
 ---
-description: Available providers for the vector_io API
-sidebar_label: Overview
-sidebar_position: 1
+sidebar_label: Vector Io
 title: Vector_Io
 ---

@@ -13,15 +11,15 @@ This section contains documentation for all available providers for the **vector

 ## Providers

-- **[Chromadb](./inline_chromadb)** - Inline provider
-- **[Faiss](./inline_faiss)** - Inline provider
-- **[Meta Reference](./inline_meta-reference)** - Inline provider
-- **[Milvus](./inline_milvus)** - Inline provider
-- **[Qdrant](./inline_qdrant)** - Inline provider
-- **[SQLite-Vec](./inline_sqlite-vec)** - Inline provider
-- **[SQLite-Vec](./inline_sqlite_vec)** - Inline provider
-- **[Chromadb](./remote_chromadb)** - Remote provider
-- **[Milvus](./remote_milvus)** - Remote provider
-- **[Pgvector](./remote_pgvector)** - Remote provider
-- **[Qdrant](./remote_qdrant)** - Remote provider
-- **[Weaviate](./remote_weaviate)** - Remote provider
+- [Chromadb](./inline_chromadb)
+- [Faiss](./inline_faiss)
+- [Meta-Reference](./inline_meta-reference)
+- [Milvus](./inline_milvus)
+- [Qdrant](./inline_qdrant)
+- [Sqlite-Vec](./inline_sqlite-vec)
+- [Sqlite Vec](./inline_sqlite_vec)
+- [Remote - Chromadb](./remote_chromadb)
+- [Remote - Milvus](./remote_milvus)
+- [Remote - Pgvector](./remote_pgvector)
+- [Remote - Qdrant](./remote_qdrant)
+- [Remote - Weaviate](./remote_weaviate)
@@ -1,7 +1,37 @@
 ---
-description: '[Chroma](https://www'
+description: |
+[Chroma](https://www.trychroma.com/) is an inline and remote vector
+database provider for Llama Stack. It allows you to store and query vectors directly within a Chroma database.
+That means you're not limited to storing vectors in memory or in a separate service.
+
+## Features
+Chroma supports:
+- Store embeddings and their metadata
+- Vector search
+- Full-text search
+- Document storage
+- Metadata filtering
+- Multi-modal retrieval
+
+## Usage
+
+To use Chrome in your Llama Stack project, follow these steps:
+
+1. Install the necessary dependencies.
+2. Configure your Llama Stack project to use chroma.
+3. Start storing and querying vectors.
+
+## Installation
+
+You can install chroma using pip:
+
+```bash
+pip install chromadb
+```
+
+## Documentation
+See [Chroma's documentation](https://docs.trychroma.com/docs/overview/introduction) for more details about Chroma in general.
 sidebar_label: Chromadb
-sidebar_position: 2
 title: inline::chromadb
 ---
@@ -9,10 +39,41 @@ title: inline::chromadb

 ## Description

 [Chroma](https://www.trychroma.com/) is an inline and remote vector
 database provider for Llama Stack. It allows you to store and query vectors directly within a Chroma database.
 That means you're not limited to storing vectors in memory or in a separate service.

+## Features
+Chroma supports:
+- Store embeddings and their metadata
+- Vector search
+- Full-text search
+- Document storage
+- Metadata filtering
+- Multi-modal retrieval
+
+## Usage
+
+To use Chrome in your Llama Stack project, follow these steps:
+
+1. Install the necessary dependencies.
+2. Configure your Llama Stack project to use chroma.
+3. Start storing and querying vectors.
+
+## Installation
+
+You can install chroma using pip:
+
+```bash
+pip install chromadb
+```
+
+## Documentation
+See [Chroma's documentation](https://docs.trychroma.com/docs/overview/introduction) for more details about Chroma in general.
+
+
 ## Configuration

 | Field | Type | Required | Default | Description |
@@ -1,7 +1,46 @@
 ---
-description: '[Faiss](https://github'
+description: |
+[Faiss](https://github.com/facebookresearch/faiss) is an inline vector database provider for Llama Stack. It
+allows you to store and query vectors directly in memory.
+That means you'll get fast and efficient vector retrieval.
+
+## Features
+
+- Lightweight and easy to use
+- Fully integrated with Llama Stack
+- GPU support
+- **Vector search** - FAISS supports pure vector similarity search using embeddings
+
+## Search Modes
+
+**Supported:**
+- **Vector Search** (`mode="vector"`): Performs vector similarity search using embeddings
+
+**Not Supported:**
+- **Keyword Search** (`mode="keyword"`): Not supported by FAISS
+- **Hybrid Search** (`mode="hybrid"`): Not supported by FAISS
+
+> **Note**: FAISS is designed as a pure vector similarity search library. See the [FAISS GitHub repository](https://github.com/facebookresearch/faiss) for more details about FAISS's core functionality.
+
+## Usage
+
+To use Faiss in your Llama Stack project, follow these steps:
+
+1. Install the necessary dependencies.
+2. Configure your Llama Stack project to use Faiss.
+3. Start storing and querying vectors.
+
+## Installation
+
+You can install Faiss using pip:
+
+```bash
+pip install faiss-cpu
+```
+## Documentation
+See [Faiss' documentation](https://faiss.ai/) or the [Faiss Wiki](https://github.com/facebookresearch/faiss/wiki) for
+more details about Faiss in general.
 sidebar_label: Faiss
-sidebar_position: 3
 title: inline::faiss
 ---
@ -9,10 +48,49 @@ title: inline::faiss
|
||||||
|
|
||||||
## Description
|
## Description
|
||||||
|
|
||||||
|
|
||||||
[Faiss](https://github.com/facebookresearch/faiss) is an inline vector database provider for Llama Stack. It
|
[Faiss](https://github.com/facebookresearch/faiss) is an inline vector database provider for Llama Stack. It
|
||||||
allows you to store and query vectors directly in memory.
|
allows you to store and query vectors directly in memory.
|
||||||
That means you'll get fast and efficient vector retrieval.
|
That means you'll get fast and efficient vector retrieval.
|
||||||
|
|
||||||
|
## Features
|
||||||
|
|
||||||
|
- Lightweight and easy to use
|
||||||
|
- Fully integrated with Llama Stack
|
||||||
|
- GPU support
|
||||||
|
- **Vector search** - FAISS supports pure vector similarity search using embeddings
|
||||||
|
|
||||||
|
## Search Modes
|
||||||
|
|
||||||
|
**Supported:**
|
||||||
|
- **Vector Search** (`mode="vector"`): Performs vector similarity search using embeddings
|
||||||
|
|
||||||
|
**Not Supported:**
|
||||||
|
- **Keyword Search** (`mode="keyword"`): Not supported by FAISS
|
||||||
|
- **Hybrid Search** (`mode="hybrid"`): Not supported by FAISS
|
||||||
|
|
||||||
|
> **Note**: FAISS is designed as a pure vector similarity search library. See the [FAISS GitHub repository](https://github.com/facebookresearch/faiss) for more details about FAISS's core functionality.
|
||||||
|
|
||||||
|
## Usage
|
||||||
|
|
||||||
|
To use Faiss in your Llama Stack project, follow these steps:
|
||||||
|
|
||||||
|
1. Install the necessary dependencies.
|
||||||
|
2. Configure your Llama Stack project to use Faiss.
|
||||||
|
3. Start storing and querying vectors.
|
||||||
|
|
||||||
|
## Installation
|
||||||
|
|
||||||
|
You can install Faiss using pip:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
pip install faiss-cpu
|
||||||
|
```
|
||||||
|
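Once installed, the provider is enabled through your distribution's `run.yaml`. A minimal sketch, assuming the SQLite-backed kvstore layout used in the sample configurations elsewhere in this changeset (the provider ID and paths are illustrative):

```yaml
vector_io:
  - provider_id: faiss
    provider_type: inline::faiss
    config:
      kvstore:
        type: sqlite
        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/dummy}/faiss_store.db
```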
## Documentation

See [Faiss' documentation](https://faiss.ai/) or the [Faiss Wiki](https://github.com/facebookresearch/faiss/wiki) for
more details about Faiss in general.

## Configuration

| Field | Type | Required | Default | Description |
@ -1,7 +1,6 @@
---
description: "Meta's reference implementation of a vector database."
sidebar_label: Meta-Reference
title: inline::meta-reference
---

@ -24,3 +23,9 @@ kvstore:
    type: sqlite
    db_path: ${env.SQLITE_STORE_DIR:=~/.llama/dummy}/faiss_store.db
```

## Deprecation Notice

:::warning
Please use the `inline::faiss` provider instead.
:::
@ -1,7 +1,6 @@
---
description: "Please refer to the remote provider documentation."
sidebar_label: Milvus
title: inline::milvus
---

@ -9,8 +8,10 @@ title: inline::milvus

## Description

Please refer to the remote provider documentation.
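As a convenience, a minimal inline Milvus entry is sketched below; it mirrors the inline configuration example shown on the remote Milvus page later in this changeset (the path is illustrative):

```yaml
vector_io:
  - provider_id: milvus
    provider_type: inline::milvus
    config:
      db_path: ~/.llama/distributions/together/milvus_store.db
```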
## Configuration

| Field | Type | Required | Default | Description |
@ -1,7 +1,47 @@
---
description: |
  [Qdrant](https://qdrant.tech/documentation/) is an inline and remote vector database provider for Llama Stack. It
  allows you to store and query vectors directly in memory.
  That means you'll get fast and efficient vector retrieval.
sidebar_label: Qdrant
title: inline::qdrant
---

@ -9,6 +49,7 @@ title: inline::qdrant

## Description

[Qdrant](https://qdrant.tech/documentation/) is an inline and remote vector database provider for Llama Stack. It
allows you to store and query vectors directly in memory.
That means you'll get fast and efficient vector retrieval.

@ -18,6 +59,40 @@ That means you'll get fast and efficient vector retrieval.

> By default, Qdrant stores vectors in RAM, delivering incredibly fast access for datasets that fit comfortably in
> memory. But when your dataset exceeds RAM capacity, Qdrant offers Memmap as an alternative.
>
> \[[An Introduction to Vector Databases](https://qdrant.tech/articles/what-is-a-vector-database/)\]

## Features

- Lightweight and easy to use
- Fully integrated with Llama Stack
- Apache 2.0 license terms
- Store embeddings and their metadata
- Supports search by
  [Keyword](https://qdrant.tech/articles/qdrant-introduces-full-text-filters-and-indexes/)
  and [Hybrid](https://qdrant.tech/articles/hybrid-search/#building-a-hybrid-search-system-in-qdrant) search
- [Multilingual and Multimodal retrieval](https://qdrant.tech/documentation/multimodal-search/)
- [Metadata filtering](https://qdrant.tech/articles/vector-search-filtering/)
- [GPU support](https://qdrant.tech/documentation/guides/running-with-gpu/)

## Usage

To use Qdrant in your Llama Stack project, follow these steps:

1. Install the necessary dependencies.
2. Configure your Llama Stack project to use Qdrant.
3. Start storing and querying vectors.

## Installation

You can install Qdrant using docker:

```bash
docker pull qdrant/qdrant
```
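A rough sketch of the corresponding provider entry follows; the exact config keys are listed in the Configuration table below, so treat the `config` block here as a placeholder:

```yaml
vector_io:
  - provider_id: qdrant
    provider_type: inline::qdrant
    config: {}  # fill in from the Configuration table below (e.g. the on-disk storage location)
```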
## Documentation

See the [Qdrant documentation](https://qdrant.tech/documentation/) for more details about Qdrant in general.

## Configuration

| Field | Type | Required | Default | Description |
@ -1,7 +1,202 @@
---
description: |
  [SQLite-Vec](https://github.com/asg017/sqlite-vec) is an inline vector database provider for Llama Stack. It
  allows you to store and query vectors directly within an SQLite database.
  That means you're not limited to storing vectors in memory or in a separate service.
sidebar_label: Sqlite-Vec
title: inline::sqlite-vec
---

@ -9,10 +204,205 @@ title: inline::sqlite-vec

## Description

[SQLite-Vec](https://github.com/asg017/sqlite-vec) is an inline vector database provider for Llama Stack. It
allows you to store and query vectors directly within an SQLite database.
That means you're not limited to storing vectors in memory or in a separate service.

## Features

- Lightweight and easy to use
- Fully integrated with Llama Stack
- Uses disk-based storage for persistence, allowing for larger vector storage

### Comparison to Faiss

The choice between Faiss and sqlite-vec should be made based on the needs of your application,
as they have different strengths.

#### Choosing the Right Provider

| Scenario | Recommended Tool | Reason |
|----------|------------------|--------|
| Online Analytical Processing (OLAP) | Faiss | Fast, in-memory searches |
| Online Transaction Processing (OLTP) | sqlite-vec | Frequent writes and reads |
| Frequent writes | sqlite-vec | Efficient disk-based storage and incremental indexing |
| Large datasets | sqlite-vec | Disk-based storage for larger vector storage |
| Datasets that can fit in memory, frequent reads | Faiss | Optimized for speed, indexing, and GPU acceleration |

#### Empirical Example

Consider the histogram below in which 10,000 randomly generated strings were inserted
in batches of 100 into both Faiss and sqlite-vec using `client.tool_runtime.rag_tool.insert()`.

```{image} ../../../../_static/providers/vector_io/write_time_comparison_sqlite-vec-faiss.png
:alt: Comparison of SQLite-Vec and Faiss write times
:width: 400px
```

You will notice that the average write time for `sqlite-vec` was 788ms, compared to
47,640ms for Faiss. While the number is jarring, if you look at the distribution, you can see that it is rather
uniformly spread across the [1500, 100000] interval.

Looking at each individual write in the order that the documents are inserted you'll see the increase in
write speed as Faiss reindexes the vectors after each write.

```{image} ../../../../_static/providers/vector_io/write_time_sequence_sqlite-vec-faiss.png
:alt: Comparison of SQLite-Vec and Faiss write times
:width: 400px
```

In comparison, the read times for Faiss were on average 10% faster than sqlite-vec.
The modes of the two distributions highlight the differences much further, where Faiss
will likely yield faster read performance.

```{image} ../../../../_static/providers/vector_io/read_time_comparison_sqlite-vec-faiss.png
:alt: Comparison of SQLite-Vec and Faiss read times
:width: 400px
```

## Usage

To use sqlite-vec in your Llama Stack project, follow these steps:

1. Install the necessary dependencies.
2. Configure your Llama Stack project to use SQLite-Vec.
3. Start storing and querying vectors.

The SQLite-vec provider supports three search modes:

1. **Vector Search** (`mode="vector"`): Performs pure vector similarity search using the embeddings.
2. **Keyword Search** (`mode="keyword"`): Performs full-text search using SQLite's FTS5.
3. **Hybrid Search** (`mode="hybrid"`): Combines both vector and keyword search for better results. First performs keyword search to get candidate matches, then applies vector similarity search on those candidates.

Example with hybrid search:

```python
response = await vector_io.query_chunks(
    vector_db_id="my_db",
    query="your query here",
    params={"mode": "hybrid", "max_chunks": 3, "score_threshold": 0.7},
)

# Using RRF ranker
response = await vector_io.query_chunks(
    vector_db_id="my_db",
    query="your query here",
    params={
        "mode": "hybrid",
        "max_chunks": 3,
        "score_threshold": 0.7,
        "ranker": {"type": "rrf", "impact_factor": 60.0},
    },
)

# Using weighted ranker
response = await vector_io.query_chunks(
    vector_db_id="my_db",
    query="your query here",
    params={
        "mode": "hybrid",
        "max_chunks": 3,
        "score_threshold": 0.7,
        "ranker": {"type": "weighted", "alpha": 0.7},  # 70% vector, 30% keyword
    },
)
```

Example with explicit vector search:

```python
response = await vector_io.query_chunks(
    vector_db_id="my_db",
    query="your query here",
    params={"mode": "vector", "max_chunks": 3, "score_threshold": 0.7},
)
```

Example with keyword search:

```python
response = await vector_io.query_chunks(
    vector_db_id="my_db",
    query="your query here",
    params={"mode": "keyword", "max_chunks": 3, "score_threshold": 0.7},
)
```

## Supported Search Modes

The SQLite vector store supports three search modes:

1. **Vector Search** (`mode="vector"`): Uses vector similarity to find relevant chunks
2. **Keyword Search** (`mode="keyword"`): Uses keyword matching to find relevant chunks
3. **Hybrid Search** (`mode="hybrid"`): Combines both vector and keyword scores using a ranker

### Hybrid Search

Hybrid search combines the strengths of both vector and keyword search by:
- Computing vector similarity scores
- Computing keyword match scores
- Using a ranker to combine these scores

Two ranker types are supported:

1. **RRF (Reciprocal Rank Fusion)**:
   - Combines ranks from both vector and keyword results
   - Uses an impact factor (default: 60.0) to control the weight of higher-ranked results
   - Good for balancing between vector and keyword results
   - The default impact factor of 60.0 comes from the original RRF paper by Cormack et al. (2009) [^1], which found this value to provide optimal performance across various retrieval tasks

2. **Weighted**:
   - Linearly combines normalized vector and keyword scores
   - Uses an alpha parameter (0-1) to control the blend:
     - alpha=0: Only use keyword scores
     - alpha=1: Only use vector scores
     - alpha=0.5: Equal weight to both (default)

Example using RAGQueryConfig with different search modes:

```python
from llama_stack.apis.tools import RAGQueryConfig, RRFRanker, WeightedRanker

# Vector search
config = RAGQueryConfig(mode="vector", max_chunks=5)

# Keyword search
config = RAGQueryConfig(mode="keyword", max_chunks=5)

# Hybrid search with custom RRF ranker
config = RAGQueryConfig(
    mode="hybrid",
    max_chunks=5,
    ranker=RRFRanker(impact_factor=50.0),  # Custom impact factor
)

# Hybrid search with weighted ranker
config = RAGQueryConfig(
    mode="hybrid",
    max_chunks=5,
    ranker=WeightedRanker(alpha=0.7),  # 70% vector, 30% keyword
)

# Hybrid search with default RRF ranker
config = RAGQueryConfig(
    mode="hybrid", max_chunks=5
)  # Will use RRF with impact_factor=60.0
```

Note: The ranker configuration is only used in hybrid mode. For vector or keyword modes, the ranker parameter is ignored.

## Installation

You can install SQLite-Vec using pip:

```bash
pip install sqlite-vec
```
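To enable the provider in a distribution, a minimal `run.yaml` entry can reuse the fields from the sample configuration shown for `inline::sqlite_vec` later in this changeset (paths are illustrative):

```yaml
vector_io:
  - provider_id: sqlite-vec
    provider_type: inline::sqlite-vec
    config:
      db_path: ${env.SQLITE_STORE_DIR:=~/.llama/dummy}/sqlite_vec.db
      kvstore:
        type: sqlite
        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/dummy}/sqlite_vec_registry.db
```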
## Documentation

See [sqlite-vec's GitHub repo](https://github.com/asg017/sqlite-vec/tree/main) for more details about sqlite-vec in general.

[^1]: Cormack, G. V., Clarke, C. L., & Buettcher, S. (2009). [Reciprocal rank fusion outperforms condorcet and individual rank learning methods](https://dl.acm.org/doi/10.1145/1571941.1572114). In Proceedings of the 32nd international ACM SIGIR conference on Research and development in information retrieval (pp. 758-759).

## Configuration

| Field | Type | Required | Default | Description |
docs/docs/providers/vector_io/inline_sqlite_vec.mdx (new file, 35 lines)

@ -0,0 +1,35 @@
---
description: "Please refer to the sqlite-vec provider documentation."
sidebar_label: Sqlite Vec
title: inline::sqlite_vec
---

# inline::sqlite_vec

## Description

Please refer to the sqlite-vec provider documentation.

## Configuration

| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `db_path` | `<class 'str'>` | No | | Path to the SQLite database file |
| `kvstore` | `utils.kvstore.config.RedisKVStoreConfig \| utils.kvstore.config.SqliteKVStoreConfig \| utils.kvstore.config.PostgresKVStoreConfig \| utils.kvstore.config.MongoDBKVStoreConfig` | No | sqlite | Config for KV store backend (SQLite only for now) |

## Sample Configuration

```yaml
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/dummy}/sqlite_vec.db
kvstore:
  type: sqlite
  db_path: ${env.SQLITE_STORE_DIR:=~/.llama/dummy}/sqlite_vec_registry.db
```

## Deprecation Notice

:::warning
Please use the `inline::sqlite-vec` provider (notice the hyphen instead of underscore) instead.
:::
@ -1,7 +1,37 @@
---
description: |
  [Chroma](https://www.trychroma.com/) is an inline and remote vector
  database provider for Llama Stack. It allows you to store and query vectors directly within a Chroma database.
  That means you're not limited to storing vectors in memory or in a separate service.
sidebar_label: Remote - Chromadb
title: remote::chromadb
---

@ -9,10 +39,40 @@ title: remote::chromadb

## Description

[Chroma](https://www.trychroma.com/) is an inline and remote vector
database provider for Llama Stack. It allows you to store and query vectors directly within a Chroma database.
That means you're not limited to storing vectors in memory or in a separate service.

## Features

Chroma supports:
- Store embeddings and their metadata
- Vector search
- Full-text search
- Document storage
- Metadata filtering
- Multi-modal retrieval

## Usage

To use Chroma in your Llama Stack project, follow these steps:

1. Install the necessary dependencies.
2. Configure your Llama Stack project to use Chroma.
3. Start storing and querying vectors.

## Installation

You can install Chroma using pip:

```bash
pip install chromadb
```
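A remote Chroma provider entry in `run.yaml` points at your Chroma server. The sketch below assumes the provider exposes a server URL setting; check the Configuration table at the end of this page for the authoritative field names:

```yaml
vector_io:
  - provider_id: chromadb
    provider_type: remote::chromadb
    config:
      url: http://localhost:8000  # assumed field name; 8000 is Chroma's default server port
```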
## Documentation

See [Chroma's documentation](https://docs.trychroma.com/docs/overview/introduction) for more details about Chroma in general.

## Configuration

| Field | Type | Required | Default | Description |
@ -1,7 +1,201 @@
---
description: |
  [Milvus](https://milvus.io/) is an inline and remote vector database provider for Llama Stack. It
  allows you to store and query vectors directly within a Milvus database.
  That means you're not limited to storing vectors in memory or in a separate service.
sidebar_label: Remote - Milvus
title: remote::milvus
---

@ -9,10 +203,39 @@ title: remote::milvus

## Description

[Milvus](https://milvus.io/) is an inline and remote vector database provider for Llama Stack. It
allows you to store and query vectors directly within a Milvus database.
That means you're not limited to storing vectors in memory or in a separate service.

## Features

- Easy to use
- Fully integrated with Llama Stack
- Supports all search modes: vector, keyword, and hybrid search (both inline and remote configurations)

## Usage

To use Milvus in your Llama Stack project, follow these steps:

1. Install the necessary dependencies.
2. Configure your Llama Stack project to use Milvus.
3. Start storing and querying vectors.

## Installation

If you want to use inline Milvus, you can install:

```bash
pip install pymilvus[milvus-lite]
```

If you want to use remote Milvus, you can install:

```bash
pip install pymilvus
```

## Configuration

In Llama Stack, Milvus can be configured in two ways:
- **Inline (Local) Configuration** - Uses Milvus-Lite for local storage
- **Remote Configuration** - Connects to a remote Milvus server

### Inline (Local) Configuration

The simplest method is local configuration, which requires setting `db_path`, a path for locally storing Milvus-Lite files:

```yaml
vector_io:
  - provider_id: milvus
    provider_type: inline::milvus
    config:
      db_path: ~/.llama/distributions/together/milvus_store.db
```

### Remote Configuration

Remote configuration is suitable for larger data storage requirements:

#### Standard Remote Connection

```yaml
vector_io:
  - provider_id: milvus
    provider_type: remote::milvus
    config:
      uri: "http://<host>:<port>"
      token: "<user>:<password>"
```

#### TLS-Enabled Remote Connection (One-way TLS)

For connections to Milvus instances with one-way TLS enabled:

```yaml
vector_io:
  - provider_id: milvus
    provider_type: remote::milvus
    config:
      uri: "https://<host>:<port>"
      token: "<user>:<password>"
      secure: True
      server_pem_path: "/path/to/server.pem"
```

#### Mutual TLS (mTLS) Remote Connection

For connections to Milvus instances with mutual TLS (mTLS) enabled:

```yaml
vector_io:
  - provider_id: milvus
    provider_type: remote::milvus
    config:
      uri: "https://<host>:<port>"
      token: "<user>:<password>"
      secure: True
      ca_pem_path: "/path/to/ca.pem"
      client_pem_path: "/path/to/client.pem"
      client_key_path: "/path/to/client.key"
```

#### Key Parameters for TLS Configuration

- **`secure`**: Enables TLS encryption when set to `true`. Defaults to `false`.
- **`server_pem_path`**: Path to the **server certificate** for verifying the server's identity (used in one-way TLS).
- **`ca_pem_path`**: Path to the **Certificate Authority (CA) certificate** for validating the server certificate (required in mTLS).
- **`client_pem_path`**: Path to the **client certificate** file (required for mTLS).
- **`client_key_path`**: Path to the **client private key** file (required for mTLS).

## Search Modes

Milvus supports three different search modes for both inline and remote configurations:

### Vector Search

Vector search uses semantic similarity to find the most relevant chunks based on embedding vectors. This is the default search mode and works well for finding conceptually similar content.

```python
# Vector search example
search_response = client.vector_stores.search(
    vector_store_id=vector_store.id,
    query="What is machine learning?",
    search_mode="vector",
    max_num_results=5,
)
```

### Keyword Search

Keyword search uses traditional text-based matching to find chunks containing specific terms or phrases. This is useful when you need exact term matches.

```python
# Keyword search example
search_response = client.vector_stores.search(
    vector_store_id=vector_store.id,
    query="Python programming language",
    search_mode="keyword",
    max_num_results=5,
)
```

### Hybrid Search

Hybrid search combines both vector and keyword search methods to provide more comprehensive results. It leverages the strengths of both semantic similarity and exact term matching.

#### Basic Hybrid Search

```python
# Basic hybrid search example (uses RRF ranker with default impact_factor=60.0)
search_response = client.vector_stores.search(
    vector_store_id=vector_store.id,
    query="neural networks in Python",
    search_mode="hybrid",
    max_num_results=5,
)
```

**Note**: The default `impact_factor` value of 60.0 was empirically determined to be optimal in the original RRF research paper: ["Reciprocal Rank Fusion outperforms Condorcet and individual Rank Learning Methods"](https://plg.uwaterloo.ca/~gvcormac/cormacksigir09-rrf.pdf) (Cormack et al., 2009).

#### Hybrid Search with RRF (Reciprocal Rank Fusion) Ranker

RRF combines rankings from vector and keyword search by using reciprocal ranks. The impact factor controls how much weight is given to higher-ranked results.

```python
# Hybrid search with custom RRF parameters
search_response = client.vector_stores.search(
    vector_store_id=vector_store.id,
    query="neural networks in Python",
    search_mode="hybrid",
    max_num_results=5,
    ranking_options={
        "ranker": {
            "type": "rrf",
            "impact_factor": 100.0,  # Higher values give more weight to top-ranked results
        }
    },
)
```

#### Hybrid Search with Weighted Ranker

Weighted ranker linearly combines normalized scores from vector and keyword search. The alpha parameter controls the balance between the two search methods.

```python
# Hybrid search with weighted ranker
search_response = client.vector_stores.search(
    vector_store_id=vector_store.id,
    query="neural networks in Python",
    search_mode="hybrid",
    max_num_results=5,
    ranking_options={
        "ranker": {
            "type": "weighted",
            "alpha": 0.7,  # 70% vector search, 30% keyword search
        }
    },
)
```

For detailed documentation on RRF and Weighted rankers, please refer to the [Milvus Reranking Guide](https://milvus.io/docs/reranking.md).

## Documentation

See the [Milvus documentation](https://milvus.io/docs/install-overview.md) for more details about Milvus in general.

For more details on TLS configuration, refer to the [TLS setup guide](https://milvus.io/docs/tls.md).

## Configuration

| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `uri` | `<class 'str'>` | No | | The URI of the Milvus server |
| `token` | `str \| None` | No | | The token of the Milvus server |
| `consistency_level` | `<class 'str'>` | No | Strong | The consistency level of the Milvus server |
| `kvstore` | `utils.kvstore.config.RedisKVStoreConfig \| utils.kvstore.config.SqliteKVStoreConfig \| utils.kvstore.config.PostgresKVStoreConfig \| utils.kvstore.config.MongoDBKVStoreConfig` | No | sqlite | Config for KV store backend |
| `config` | `dict` | No | {} | This configuration allows additional fields to be passed through to the underlying Milvus client. See the [Milvus](https://milvus.io/docs/install-overview.md) documentation for more details about Milvus in general. |

:::note
This configuration class accepts additional fields beyond those listed above. You can pass any additional configuration options that will be forwarded to the underlying provider.
:::
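For example, an extra option can be forwarded to the Milvus client through the nested `config` dict; the passthrough key below is only an illustration of the mechanism, not a required setting:

```yaml
vector_io:
  - provider_id: milvus
    provider_type: remote::milvus
    config:
      uri: "http://<host>:<port>"
      token: "<user>:<password>"
      config:
        db_name: my_database  # illustrative passthrough option handed to the Milvus client
```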
## Sample Configuration
@ -1,7 +1,105 @@
|
||||||
---
|
---
|
||||||
description: '[PGVector](https://github'
|
description: |
|
||||||
sidebar_label: Pgvector
|
[PGVector](https://github.com/pgvector/pgvector) is a remote vector database provider for Llama Stack. It
|
||||||
sidebar_position: 11
|
allows you to store and query vectors directly in memory.
|
||||||
|
That means you'll get fast and efficient vector retrieval.
|
||||||
|
|
||||||
|
## Features
|
||||||
|
|
||||||
|
- Easy to use
|
||||||
|
- Fully integrated with Llama Stack
|
||||||
|
|
||||||
|
There are three implementations of search for PGVectoIndex available:
|
||||||
|
|
||||||
|
1. Vector Search:
|
||||||
|
- How it works:
|
||||||
|
- Uses PostgreSQL's vector extension (pgvector) to perform similarity search
|
||||||
|
- Compares query embeddings against stored embeddings using Cosine distance or other distance metrics
|
||||||
|
- Eg. SQL query: SELECT document, embedding <=> %s::vector AS distance FROM table ORDER BY distance
|
||||||
|
|
||||||
|
-Characteristics:
|
||||||
|
- Semantic understanding - finds documents similar in meaning even if they don't share keywords
|
||||||
|
- Works with high-dimensional vector embeddings (typically 768, 1024, or higher dimensions)
|
||||||
|
- Best for: Finding conceptually related content, handling synonyms, cross-language search
|
||||||
|
|
||||||
|
2. Keyword Search
|
||||||
|
- How it works:
|
||||||
|
- Uses PostgreSQL's full-text search capabilities with tsvector and ts_rank
|
||||||
|
- Converts text to searchable tokens using to_tsvector('english', text). Default language is English.
|
||||||
|
- Eg. SQL query: SELECT document, ts_rank(tokenized_content, plainto_tsquery('english', %s)) AS score
|
||||||
|
|
||||||
|
- Characteristics:
|
||||||
|
- Lexical matching - finds exact keyword matches and variations
|
||||||
|
- Uses GIN (Generalized Inverted Index) for fast text search performance
|
||||||
|
- Scoring: Uses PostgreSQL's ts_rank function for relevance scoring
|
||||||
|
- Best for: Exact term matching, proper names, technical terms, Boolean-style queries
|
||||||
|
|
||||||
|
3. Hybrid Search
|
||||||
|
- How it works:
|
||||||
|
- Combines both vector and keyword search results
|
||||||
|
- Runs both searches independently, then merges results using configurable reranking
|
||||||
|
|
||||||
|
- Two reranking strategies available:
|
||||||
|
- Reciprocal Rank Fusion (RRF) - (default: 60.0)
|
||||||
|
- Weighted Average - (default: 0.5)
|
||||||
|
|
||||||
|
- Characteristics:
|
||||||
|
- Best of both worlds: semantic understanding + exact matching
|
||||||
|
- Documents appearing in both searches get boosted scores
|
||||||
|
- Configurable balance between semantic and lexical matching
|
||||||
|
- Best for: General-purpose search where you want both precision and recall
|
||||||
|
|
||||||
|
4. Database Schema
|
||||||
|
The PGVector implementation stores data optimized for all three search types:
|
||||||
|
CREATE TABLE vector_store_xxx (
|
||||||
|
id TEXT PRIMARY KEY,
|
||||||
|
document JSONB, -- Original document
|
||||||
|
embedding vector(dimension), -- For vector search
|
||||||
|
content_text TEXT, -- Raw text content
|
||||||
|
tokenized_content TSVECTOR -- For keyword search
|
||||||
|
);
|
||||||
|
|
||||||
|
-- Indexes for performance
|
||||||
|
CREATE INDEX content_gin_idx ON table USING GIN(tokenized_content); -- Keyword search
|
||||||
|
-- Vector index created automatically by pgvector
|
||||||
|
|
||||||
|
## Usage
|
||||||
|
|
||||||
|
To use PGVector in your Llama Stack project, follow these steps:
|
||||||
|
|
||||||
|
1. Install the necessary dependencies.
|
||||||
|
2. Configure your Llama Stack project to use pgvector. (e.g. remote::pgvector).
|
||||||
|
3. Start storing and querying vectors.
|
||||||
|
|
||||||
|
## This is an example how you can set up your environment for using PGVector
|
||||||
|
|
||||||
|
1. Export env vars:
|
||||||
|
```bash
|
||||||
|
export ENABLE_PGVECTOR=true
|
||||||
|
export PGVECTOR_HOST=localhost
|
||||||
|
export PGVECTOR_PORT=5432
|
||||||
|
export PGVECTOR_DB=llamastack
|
||||||
|
export PGVECTOR_USER=llamastack
|
||||||
|
export PGVECTOR_PASSWORD=llamastack
|
||||||
|
```
|
||||||
|
|
||||||
|
2. Create DB:
|
||||||
|
```bash
|
||||||
|
psql -h localhost -U postgres -c "CREATE ROLE llamastack LOGIN PASSWORD 'llamastack';"
|
||||||
|
psql -h localhost -U postgres -c "CREATE DATABASE llamastack OWNER llamastack;"
|
||||||
|
psql -h localhost -U llamastack -d llamastack -c "CREATE EXTENSION IF NOT EXISTS vector;"
|
||||||
|
```
|
||||||
|
|
||||||
|
## Installation
|
||||||
|
|
||||||
|
You can install PGVector using docker:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
docker pull pgvector/pgvector:pg17
|
||||||
|
```
|
||||||
|
## Documentation
|
||||||
|
See [PGVector's documentation](https://github.com/pgvector/pgvector) for more details about PGVector in general.
|
||||||
|
sidebar_label: Remote - Pgvector
|
||||||
title: remote::pgvector
|
title: remote::pgvector
|
||||||
---
|
---
|
||||||
|
|
||||||
|
@ -9,10 +107,108 @@ title: remote::pgvector
|
||||||
|
|
||||||
## Description
|
## Description
|
||||||
|
|
||||||
|
|
||||||
[PGVector](https://github.com/pgvector/pgvector) is a remote vector database provider for Llama Stack. It
|
[PGVector](https://github.com/pgvector/pgvector) is a remote vector database provider for Llama Stack. It
|
||||||
allows you to store and query vectors directly in memory.
|
allows you to store and query vectors directly in memory.
|
||||||
That means you'll get fast and efficient vector retrieval.
|
That means you'll get fast and efficient vector retrieval.
|
||||||
|
|
||||||
|
## Features
|
||||||
|
|
||||||
|
- Easy to use
|
||||||
|
- Fully integrated with Llama Stack
|
||||||
|
|
||||||
|
There are three implementations of search for PGVectoIndex available:
|
||||||
|
|
||||||
|
1. Vector Search:
|
||||||
|
- How it works:
|
||||||
|
- Uses PostgreSQL's vector extension (pgvector) to perform similarity search
|
||||||
|
- Compares query embeddings against stored embeddings using Cosine distance or other distance metrics
|
||||||
|
- Eg. SQL query: SELECT document, embedding <=> %s::vector AS distance FROM table ORDER BY distance
|
||||||
|
|
||||||
|
-Characteristics:
|
||||||
|
- Semantic understanding - finds documents similar in meaning even if they don't share keywords
|
||||||
|
- Works with high-dimensional vector embeddings (typically 768, 1024, or higher dimensions)
|
||||||
|
- Best for: Finding conceptually related content, handling synonyms, cross-language search
|
||||||
|
|
||||||
|
2. Keyword Search
|
||||||
|
- How it works:
|
||||||
|
- Uses PostgreSQL's full-text search capabilities with tsvector and ts_rank
|
||||||
|
- Converts text to searchable tokens using to_tsvector('english', text). Default language is English.
|
||||||
|
- Eg. SQL query: SELECT document, ts_rank(tokenized_content, plainto_tsquery('english', %s)) AS score
|
||||||
|
|
||||||
|
- Characteristics:
|
||||||
|
- Lexical matching - finds exact keyword matches and variations
|
||||||
|
- Uses GIN (Generalized Inverted Index) for fast text search performance
|
||||||
|
- Scoring: Uses PostgreSQL's ts_rank function for relevance scoring
|
||||||
|
- Best for: Exact term matching, proper names, technical terms, Boolean-style queries
|
||||||
|
|
||||||
|
3. Hybrid Search
|
||||||
|
- How it works:
|
||||||
|
- Combines both vector and keyword search results
|
||||||
|
- Runs both searches independently, then merges results using configurable reranking
|
||||||
|
|
||||||
|
- Two reranking strategies available:
|
||||||
|
- Reciprocal Rank Fusion (RRF) - (default: 60.0)
|
||||||
|
- Weighted Average - (default: 0.5)
|
||||||
|
|
||||||
|
- Characteristics:
|
||||||
|
- Best of both worlds: semantic understanding + exact matching
|
||||||
|
- Documents appearing in both searches get boosted scores
|
||||||
|
- Configurable balance between semantic and lexical matching
|
||||||
|
- Best for: General-purpose search where you want both precision and recall
|
||||||
|
|
||||||
|
4. Database Schema
|
||||||
|
The PGVector implementation stores data optimized for all three search types:
|
||||||
|
CREATE TABLE vector_store_xxx (
|
||||||
|
id TEXT PRIMARY KEY,
|
||||||
|
document JSONB, -- Original document
|
||||||
|
embedding vector(dimension), -- For vector search
|
||||||
|
content_text TEXT, -- Raw text content
|
||||||
|
tokenized_content TSVECTOR -- For keyword search
|
||||||
|
);
|
||||||
|
|
||||||
|
-- Indexes for performance
|
||||||
|
CREATE INDEX content_gin_idx ON table USING GIN(tokenized_content); -- Keyword search
|
||||||
|
-- Vector index created automatically by pgvector
|
||||||
|
|
||||||
|
## Usage
|
||||||
|
|
||||||
|
To use PGVector in your Llama Stack project, follow these steps:
|
||||||
|
|
||||||
|
1. Install the necessary dependencies.
|
||||||
|
2. Configure your Llama Stack project to use pgvector. (e.g. remote::pgvector).
|
||||||
|
3. Start storing and querying vectors.
|
||||||
|
|
||||||
|
## Example: Setting Up Your Environment for PGVector

1. Export env vars:

```bash
export ENABLE_PGVECTOR=true
export PGVECTOR_HOST=localhost
export PGVECTOR_PORT=5432
export PGVECTOR_DB=llamastack
export PGVECTOR_USER=llamastack
export PGVECTOR_PASSWORD=llamastack
```

2. Create DB:

```bash
psql -h localhost -U postgres -c "CREATE ROLE llamastack LOGIN PASSWORD 'llamastack';"
psql -h localhost -U postgres -c "CREATE DATABASE llamastack OWNER llamastack;"
psql -h localhost -U llamastack -d llamastack -c "CREATE EXTENSION IF NOT EXISTS vector;"
```
## Installation

You can install PGVector using Docker:

```bash
docker pull pgvector/pgvector:pg17
```

## Documentation

See [PGVector's documentation](https://github.com/pgvector/pgvector) for more details about PGVector in general.

## Configuration

| Field | Type | Required | Default | Description |
@ -1,7 +1,6 @@
---
description: "Please refer to the inline provider documentation."
sidebar_label: Remote - Qdrant
title: remote::qdrant
---

@ -9,8 +8,10 @@ title: remote::qdrant

## Description

Please refer to the inline provider documentation.

## Configuration

| Field | Type | Required | Default | Description |
@ -1,7 +1,35 @@
---
description: |
  [Weaviate](https://weaviate.io/) is a vector database provider for Llama Stack.
  It allows you to store and query vectors directly within a Weaviate database.
  That means you're not limited to storing vectors in memory or in a separate service.

  ## Features
  Weaviate supports:
  - Store embeddings and their metadata
  - Vector search
  - Full-text search
  - Hybrid search
  - Document storage
  - Metadata filtering
  - Multi-modal retrieval

  ## Usage

  To use Weaviate in your Llama Stack project, follow these steps:

  1. Install the necessary dependencies.
  2. Configure your Llama Stack project to use Weaviate.
  3. Start storing and querying vectors.

  ## Installation

  To install Weaviate see the [Weaviate quickstart documentation](https://weaviate.io/developers/weaviate/quickstart).

  ## Documentation
  See [Weaviate's documentation](https://weaviate.io/developers/weaviate) for more details about Weaviate in general.
sidebar_label: Remote - Weaviate
title: remote::weaviate
---

@ -9,10 +37,38 @@ title: remote::weaviate

## Description

[Weaviate](https://weaviate.io/) is a vector database provider for Llama Stack.
It allows you to store and query vectors directly within a Weaviate database.
That means you're not limited to storing vectors in memory or in a separate service.

## Features
Weaviate supports:
- Store embeddings and their metadata
- Vector search
- Full-text search
- Hybrid search
- Document storage
- Metadata filtering
- Multi-modal retrieval

## Usage

To use Weaviate in your Llama Stack project, follow these steps:

1. Install the necessary dependencies.
2. Configure your Llama Stack project to use Weaviate.
3. Start storing and querying vectors.

## Installation

To install Weaviate see the [Weaviate quickstart documentation](https://weaviate.io/developers/weaviate/quickstart).

## Documentation

See [Weaviate's documentation](https://weaviate.io/developers/weaviate) for more details about Weaviate in general.

## Configuration

| Field | Type | Required | Default | Description |
@ -0,0 +1,125 @@
---
orphan: true
---
<!-- This file was auto-generated by distro_codegen.py, please edit source -->
# Meta Reference GPU Distribution

```{toctree}
:maxdepth: 2
:hidden:

self
```

The `llamastack/distribution-meta-reference-gpu` distribution consists of the following provider configurations:

| API | Provider(s) |
|-----|-------------|
| agents | `inline::meta-reference` |
| datasetio | `remote::huggingface`, `inline::localfs` |
| eval | `inline::meta-reference` |
| inference | `inline::meta-reference` |
| safety | `inline::llama-guard` |
| scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` |
| telemetry | `inline::meta-reference` |
| tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::rag-runtime`, `remote::model-context-protocol` |
| vector_io | `inline::faiss`, `remote::chromadb`, `remote::pgvector` |

Note that you need access to NVIDIA GPUs to run this distribution. This distribution is not compatible with CPU-only machines or machines with AMD GPUs.

### Environment Variables

The following environment variables can be configured:

- `LLAMA_STACK_PORT`: Port for the Llama Stack distribution server (default: `8321`)
- `INFERENCE_MODEL`: Inference model loaded into the Meta Reference server (default: `meta-llama/Llama-3.2-3B-Instruct`)
- `INFERENCE_CHECKPOINT_DIR`: Directory containing the Meta Reference model checkpoint (default: `null`)
- `SAFETY_MODEL`: Name of the safety (Llama-Guard) model to use (default: `meta-llama/Llama-Guard-3-1B`)
- `SAFETY_CHECKPOINT_DIR`: Directory containing the Llama-Guard model checkpoint (default: `null`)

## Prerequisite: Downloading Models

Please use `llama model list --downloaded` to check that you have llama model checkpoints downloaded in `~/.llama` before proceeding. See the [installation guide](../../references/llama_cli_reference/download_models.md) to download the models. Run `llama model list` to see the available models to download, and `llama model download` to download the checkpoints.

```
$ llama model list --downloaded
┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┓
┃ Model                                   ┃ Size     ┃ Modified Time       ┃
┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━┩
│ Llama3.2-1B-Instruct:int4-qlora-eo8     │ 1.53 GB  │ 2025-02-26 11:22:28 │
├─────────────────────────────────────────┼──────────┼─────────────────────┤
│ Llama3.2-1B                             │ 2.31 GB  │ 2025-02-18 21:48:52 │
├─────────────────────────────────────────┼──────────┼─────────────────────┤
│ Prompt-Guard-86M                        │ 0.02 GB  │ 2025-02-26 11:29:28 │
├─────────────────────────────────────────┼──────────┼─────────────────────┤
│ Llama3.2-3B-Instruct:int4-spinquant-eo8 │ 3.69 GB  │ 2025-02-26 11:37:41 │
├─────────────────────────────────────────┼──────────┼─────────────────────┤
│ Llama3.2-3B                             │ 5.99 GB  │ 2025-02-18 21:51:26 │
├─────────────────────────────────────────┼──────────┼─────────────────────┤
│ Llama3.1-8B                             │ 14.97 GB │ 2025-02-16 10:36:37 │
├─────────────────────────────────────────┼──────────┼─────────────────────┤
│ Llama3.2-1B-Instruct:int4-spinquant-eo8 │ 1.51 GB  │ 2025-02-26 11:35:02 │
├─────────────────────────────────────────┼──────────┼─────────────────────┤
│ Llama-Guard-3-1B                        │ 2.80 GB  │ 2025-02-26 11:20:46 │
├─────────────────────────────────────────┼──────────┼─────────────────────┤
│ Llama-Guard-3-1B:int4                   │ 0.43 GB  │ 2025-02-26 11:33:33 │
└─────────────────────────────────────────┴──────────┴─────────────────────┘
```

## Running the Distribution

You can do this via venv, or via Docker, which has a pre-built image.

### Via Docker

This method allows you to get started quickly without having to build the distribution code.

```bash
LLAMA_STACK_PORT=8321
docker run \
  -it \
  --pull always \
  --gpus all \
  -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
  -v ~/.llama:/root/.llama \
  llamastack/distribution-meta-reference-gpu \
  --port $LLAMA_STACK_PORT \
  --env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct
```

If you are using Llama Stack Safety / Shield APIs, use:

```bash
docker run \
  -it \
  --pull always \
  --gpus all \
  -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
  -v ~/.llama:/root/.llama \
  llamastack/distribution-meta-reference-gpu \
  --port $LLAMA_STACK_PORT \
  --env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct \
  --env SAFETY_MODEL=meta-llama/Llama-Guard-3-1B
```

### Via venv

Make sure you have done `uv pip install llama-stack` and have the Llama Stack CLI available.

```bash
llama stack build --distro meta-reference-gpu --image-type venv
llama stack run distributions/meta-reference-gpu/run.yaml \
  --port 8321 \
  --env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct
```

If you are using Llama Stack Safety / Shield APIs, use:

```bash
llama stack run distributions/meta-reference-gpu/run-with-safety.yaml \
  --port 8321 \
  --env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct \
  --env SAFETY_MODEL=meta-llama/Llama-Guard-3-1B
```
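Once the server is running (via Docker or venv), a quick way to sanity-check it is to list the registered models and send one chat completion through the Python client. This is a minimal sketch: it assumes the server is listening on localhost:8321 with the default `INFERENCE_MODEL`, and the exact client method names may vary between `llama_stack_client` versions.

```python
from llama_stack_client import LlamaStackClient

client = LlamaStackClient(base_url="http://localhost:8321")

# Confirm the distribution registered the expected inference model.
for model in client.models.list():
    print(model.identifier)

# Send a simple request to the Meta Reference inference provider.
response = client.inference.chat_completion(
    model_id="meta-llama/Llama-3.2-3B-Instruct",
    messages=[{"role": "user", "content": "Say hello in one sentence."}],
)
print(response.completion_message.content)
```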
docs/source/distributions/self_hosted_distro/nvidia.md (new file)

@ -0,0 +1,171 @@
---
orphan: true
---
<!-- This file was auto-generated by distro_codegen.py, please edit source -->
# NVIDIA Distribution

The `llamastack/distribution-nvidia` distribution consists of the following provider configurations.

| API | Provider(s) |
|-----|-------------|
| agents | `inline::meta-reference` |
| datasetio | `inline::localfs`, `remote::nvidia` |
| eval | `remote::nvidia` |
| files | `inline::localfs` |
| inference | `remote::nvidia` |
| post_training | `remote::nvidia` |
| safety | `remote::nvidia` |
| scoring | `inline::basic` |
| telemetry | `inline::meta-reference` |
| tool_runtime | `inline::rag-runtime` |
| vector_io | `inline::faiss` |

### Environment Variables

The following environment variables can be configured:

- `NVIDIA_API_KEY`: NVIDIA API Key (default: ``)
- `NVIDIA_APPEND_API_VERSION`: Whether to append the API version to the base_url (default: `True`)
- `NVIDIA_DATASET_NAMESPACE`: NVIDIA Dataset Namespace (default: `default`)
- `NVIDIA_PROJECT_ID`: NVIDIA Project ID (default: `test-project`)
- `NVIDIA_CUSTOMIZER_URL`: NVIDIA Customizer URL (default: `https://customizer.api.nvidia.com`)
- `NVIDIA_OUTPUT_MODEL_DIR`: NVIDIA Output Model Directory (default: `test-example-model@v1`)
- `GUARDRAILS_SERVICE_URL`: URL for the NeMo Guardrails Service (default: `http://0.0.0.0:7331`)
- `NVIDIA_GUARDRAILS_CONFIG_ID`: NVIDIA Guardrail Configuration ID (default: `self-check`)
- `NVIDIA_EVALUATOR_URL`: URL for the NeMo Evaluator Service (default: `http://0.0.0.0:7331`)
- `INFERENCE_MODEL`: Inference model (default: `Llama3.1-8B-Instruct`)
- `SAFETY_MODEL`: Name of the model to use for safety (default: `meta/llama-3.1-8b-instruct`)

### Models

The following models are available by default:

- `meta/llama3-8b-instruct`
- `meta/llama3-70b-instruct`
- `meta/llama-3.1-8b-instruct`
- `meta/llama-3.1-70b-instruct`
- `meta/llama-3.1-405b-instruct`
- `meta/llama-3.2-1b-instruct`
- `meta/llama-3.2-3b-instruct`
- `meta/llama-3.2-11b-vision-instruct`
- `meta/llama-3.2-90b-vision-instruct`
- `meta/llama-3.3-70b-instruct`
- `nvidia/vila`
- `nvidia/llama-3.2-nv-embedqa-1b-v2`
- `nvidia/nv-embedqa-e5-v5`
- `nvidia/nv-embedqa-mistral-7b-v2`
- `snowflake/arctic-embed-l`

## Prerequisites
### NVIDIA API Keys

Make sure you have access to an NVIDIA API key. You can get one by visiting [https://build.nvidia.com/](https://build.nvidia.com/). Use this key for the `NVIDIA_API_KEY` environment variable.

### Deploy NeMo Microservices Platform
The NVIDIA NeMo microservices platform supports end-to-end microservice deployment of a complete AI flywheel on your Kubernetes cluster through the NeMo Microservices Helm Chart. Please refer to the [NVIDIA NeMo Microservices documentation](https://docs.nvidia.com/nemo/microservices/latest/about/index.html) for platform prerequisites and instructions to install and deploy the platform.

## Supported Services
Each Llama Stack API corresponds to a specific NeMo microservice. The core microservices (Customizer, Evaluator, Guardrails) are exposed by the same endpoint. The platform components (Data Store) are each exposed by separate endpoints.

### Inference: NVIDIA NIM
NVIDIA NIM is used for running inference with registered models. There are two ways to access NVIDIA NIMs:
1. Hosted (default): Preview APIs hosted at https://integrate.api.nvidia.com (requires an API key)
2. Self-hosted: NVIDIA NIMs that run on your own infrastructure.

The deployed platform includes the NIM Proxy microservice, which is the service that provides access to your NIMs (for example, to run inference on a model). Set the `NVIDIA_BASE_URL` environment variable to use your NVIDIA NIM Proxy deployment.

### Datasetio API: NeMo Data Store
The NeMo Data Store microservice serves as the default file storage solution for the NeMo microservices platform. It exposes APIs compatible with the Hugging Face Hub client (`HfApi`), so you can use that client to interact with the Data Store. The `NVIDIA_DATASETS_URL` environment variable should point to your NeMo Data Store endpoint.

See the {repopath}`NVIDIA Datasetio docs::llama_stack/providers/remote/datasetio/nvidia/README.md` for supported features and example usage.
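As a brief illustration of that Hub-compatible surface, the following sketch points `HfApi` at the Data Store endpoint. The environment variable comes from the table above; the repository id is a placeholder assumption.

```python
import os

from huggingface_hub import HfApi

# NeMo Data Store exposes Hugging Face Hub-compatible APIs, so HfApi can talk
# to it directly once pointed at the NVIDIA_DATASETS_URL endpoint.
hf_api = HfApi(endpoint=os.environ["NVIDIA_DATASETS_URL"], token="")

# List the files of a dataset repository stored in the Data Store
# ("default/sample-dataset" is an illustrative placeholder).
print(hf_api.list_repo_files(repo_id="default/sample-dataset", repo_type="dataset"))
```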
### Eval API: NeMo Evaluator
The NeMo Evaluator microservice supports evaluation of LLMs. Launching an evaluation job with NeMo Evaluator requires an Evaluation Config (an object that contains metadata needed by the job). A Llama Stack Benchmark maps to an Evaluation Config, so registering a Benchmark creates an Evaluation Config in NeMo Evaluator. The `NVIDIA_EVALUATOR_URL` environment variable should point to your NeMo Microservices endpoint.

See the {repopath}`NVIDIA Eval docs::llama_stack/providers/remote/eval/nvidia/README.md` for supported features and example usage.
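As a rough sketch of that mapping, registering a Benchmark through the Python client might look like the following; the ids and scoring function are placeholders, and the exact method signature is an assumption that may differ between client versions.

```python
from llama_stack_client import LlamaStackClient

client = LlamaStackClient(base_url="http://localhost:8321")

# Registering a Benchmark creates the corresponding Evaluation Config in
# NeMo Evaluator (ids and scoring function are illustrative placeholders).
client.benchmarks.register(
    benchmark_id="my-benchmark",
    dataset_id="my-dataset",
    scoring_functions=["basic::equality"],
)
```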
### Post-Training API: NeMo Customizer
The NeMo Customizer microservice supports fine-tuning models. You can reference {repopath}`this list of supported models::llama_stack/providers/remote/post_training/nvidia/models.py` that can be fine-tuned using Llama Stack. The `NVIDIA_CUSTOMIZER_URL` environment variable should point to your NeMo Microservices endpoint.

See the {repopath}`NVIDIA Post-Training docs::llama_stack/providers/remote/post_training/nvidia/README.md` for supported features and example usage.

### Safety API: NeMo Guardrails
The NeMo Guardrails microservice sits between your application and the LLM, and adds checks and content moderation to a model. The `GUARDRAILS_SERVICE_URL` environment variable should point to your NeMo Microservices endpoint.

See the {repopath}`NVIDIA Safety docs::llama_stack/providers/remote/safety/nvidia/README.md` for supported features and example usage.
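To show where the Safety API fits, here is a minimal sketch of running a shield check through the Python client; the shield id is a placeholder, and the exact method signature is an assumption that may differ between client versions.

```python
from llama_stack_client import LlamaStackClient

client = LlamaStackClient(base_url="http://localhost:8321")

# Run a guardrails check on a user message before sending it to the model
# ("nvidia" as the shield id is an illustrative placeholder).
result = client.safety.run_shield(
    shield_id="nvidia",
    messages=[{"role": "user", "content": "How do I reset my password?"}],
    params={},
)
print(result.violation)
```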
## Deploying models
In order to use a registered model with the Llama Stack APIs, ensure the corresponding NIM is deployed to your environment. For example, you can use the NIM Proxy microservice to deploy `meta/llama-3.2-1b-instruct`.

Note: For improved inference speeds, we need to use NIM with the `fast_outlines` guided decoding system (specified in the request body). This is the default if you deployed the platform with the NeMo Microservices Helm Chart.

```sh
# URL to NeMo NIM Proxy service
export NEMO_URL="http://nemo.test"

curl --location "$NEMO_URL/v1/deployment/model-deployments" \
  -H 'accept: application/json' \
  -H 'Content-Type: application/json' \
  -d '{
    "name": "llama-3.2-1b-instruct",
    "namespace": "meta",
    "config": {
      "model": "meta/llama-3.2-1b-instruct",
      "nim_deployment": {
        "image_name": "nvcr.io/nim/meta/llama-3.2-1b-instruct",
        "image_tag": "1.8.3",
        "pvc_size": "25Gi",
        "gpu": 1,
        "additional_envs": {
          "NIM_GUIDED_DECODING_BACKEND": "fast_outlines"
        }
      }
    }
  }'
```

This NIM deployment should take approximately 10 minutes to go live. [See the docs](https://docs.nvidia.com/nemo/microservices/latest/get-started/tutorials/deploy-nims.html) for more information on how to deploy a NIM and verify it's available for inference.

You can also remove a deployed NIM to free up GPU resources, if needed.

```sh
export NEMO_URL="http://nemo.test"

curl -X DELETE "$NEMO_URL/v1/deployment/model-deployments/meta/llama-3.1-8b-instruct"
```

## Running Llama Stack with NVIDIA

You can do this via venv (building the code locally), or via Docker, which has a pre-built image.

### Via Docker

This method allows you to get started quickly without having to build the distribution code.

```bash
LLAMA_STACK_PORT=8321
docker run \
  -it \
  --pull always \
  -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
  -v ./run.yaml:/root/my-run.yaml \
  llamastack/distribution-nvidia \
  --config /root/my-run.yaml \
  --port $LLAMA_STACK_PORT \
  --env NVIDIA_API_KEY=$NVIDIA_API_KEY
```

### Via venv

If you've set up your local development environment, you can also build the image using your local virtual environment.

```bash
INFERENCE_MODEL=meta-llama/Llama-3.1-8B-Instruct
llama stack build --distro nvidia --image-type venv
llama stack run ./run.yaml \
  --port 8321 \
  --env NVIDIA_API_KEY=$NVIDIA_API_KEY \
  --env INFERENCE_MODEL=$INFERENCE_MODEL
```

## Example Notebooks
For examples of how to use the NVIDIA Distribution to run inference, fine-tune, evaluate, and run safety checks on your LLMs, you can reference the example notebooks in {repopath}`docs/notebooks/nvidia`.
@ -10,11 +10,11 @@ import sys
from pathlib import Path
from typing import Any

from pydantic_core import PydanticUndefined
from rich.progress import Progress, SpinnerColumn, TextColumn

from llama_stack.core.distribution import get_provider_registry

REPO_ROOT = Path(__file__).parent.parent

@ -22,9 +22,7 @@ def get_api_docstring(api_name: str) -> str | None:
    """Extract docstring from the API protocol class."""
    try:
        # Import the API module dynamically
        api_module = __import__(f"llama_stack.apis.{api_name}", fromlist=[api_name.title()])

        # Get the main protocol class (usually capitalized API name)
        protocol_class_name = api_name.title()

@ -72,10 +70,7 @@ def get_config_class_info(config_class_path: str) -> dict[str, Any]:
        model_config = config_class.model_config
        if hasattr(model_config, "extra") and model_config.extra == "allow":
            accepts_extra_config = True
        elif isinstance(model_config, dict) and model_config.get("extra") == "allow":
            accepts_extra_config = True

        fields_info = {}

@ -84,19 +79,9 @@ def get_config_class_info(config_class_path: str) -> dict[str, Any]:
            field_type = str(field.annotation) if field.annotation else "Any"

            # this string replace is ridiculous
            field_type = field_type.replace("typing.", "").replace("Optional[", "").replace("]", "")
            field_type = field_type.replace("Annotated[", "").replace("FieldInfo(", "").replace(")", "")
            field_type = field_type.replace("llama_stack.apis.inference.inference.", "")
            field_type = field_type.replace("llama_stack.providers.", "")

            default_value = field.default

@ -106,10 +91,7 @@ def get_config_class_info(config_class_path: str) -> dict[str, Any]:
            # HACK ALERT:
            # If the default value contains a path that looks like it came from RUNTIME_BASE_DIR,
            # replace it with a generic ~/.llama/ path for documentation
            if isinstance(default_value, str) and "/.llama/" in default_value:
                if ".llama/" in default_value:
                    path_part = default_value.split(".llama/")[-1]
                    default_value = f"~/.llama/{path_part}"

@ -135,11 +117,7 @@ def get_config_class_info(config_class_path: str) -> dict[str, Any]:
        lines = source.split("\n")

        for i, line in enumerate(lines):
            if "model_config" in line and "ConfigDict" in line and 'extra="allow"' in line:
                comments = []
                for j in range(i - 1, -1, -1):
                    stripped = lines[j].strip()

@ -204,9 +182,7 @@ def generate_provider_docs(progress, provider_spec: Any, api_name: str) -> str:
    # Create sidebar label (clean up provider_type for display)
    sidebar_label = provider_type.replace("::", " - ").replace("_", " ")
    if sidebar_label.startswith("inline - "):
        sidebar_label = sidebar_label[9:].title()  # Remove "inline - " prefix and title case
    else:
        sidebar_label = sidebar_label.title()
@ -219,7 +195,8 @@ def generate_provider_docs(progress, provider_spec: Any, api_name: str) -> str:
    if "\n" in description.strip():
        md_lines.append("description: |")
        for line in description.strip().split("\n"):
            # Avoid trailing whitespace by only adding spaces to non-empty lines
            md_lines.append(f"  {line}" if line.strip() else "")
    else:
        # For single line descriptions, format properly for YAML
        clean_desc = description.strip().replace('"', '\\"')
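A tiny standalone Python illustration (not part of the script) of why the `f"  {line}" if line.strip() else ""` change matters: blank lines inside a multi-line description no longer pick up two trailing spaces in the generated frontmatter.

```python
description = "Line one.\n\nLine two."

md_lines = ["description: |"]
for line in description.strip().split("\n"):
    # Only indent non-empty lines; empty lines stay truly empty.
    md_lines.append(f"  {line}" if line.strip() else "")

print(md_lines)
# ['description: |', '  Line one.', '', '  Line two.']
```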
@ -248,14 +225,10 @@ def generate_provider_docs(progress, provider_spec: Any, api_name: str) -> str:
        for field_name, field_info in config_info["fields"].items():
            field_type = field_info["type"].replace("|", "\\|")
            required = "Yes" if field_info["required"] else "No"
            default = str(field_info["default"]) if field_info["default"] is not None else ""
            description_text = field_info["description"] or ""

            md_lines.append(f"| `{field_name}` | `{field_type}` | {required} | {default} | {description_text} |")

        md_lines.append("")

@ -297,22 +270,16 @@ def generate_provider_docs(progress, provider_spec: Any, api_name: str) -> str:
                return obj

            sample_config_dict = convert_pydantic_to_dict(sample_config)
            # Strip trailing newlines from yaml.dump to prevent extra blank lines
            yaml_output = yaml.dump(sample_config_dict, default_flow_style=False, sort_keys=False).rstrip()
            md_lines.append(yaml_output)
        else:
            md_lines.append("# No sample configuration available.")
    except Exception as e:
        md_lines.append(f"# Error generating sample config: {str(e)}")
    md_lines.append("```")

    if hasattr(provider_spec, "deprecation_warning") and provider_spec.deprecation_warning:
        md_lines.append("## Deprecation Notice")
        md_lines.append("")
        md_lines.append(":::warning")

@ -330,9 +297,7 @@ def generate_provider_docs(progress, provider_spec: Any, api_name: str) -> str:
    return "\n".join(md_lines) + "\n"


def generate_index_docs(api_name: str, api_docstring: str | None, provider_entries: list) -> str:
    """Generate MDX documentation for the index file."""
    # Create sidebar label for the API
    sidebar_label = api_name.replace("_", " ").title()

@ -360,9 +325,7 @@ def generate_index_docs(
    md_lines.append(f"{cleaned_docstring}")
    md_lines.append("")

    md_lines.append(f"This section contains documentation for all available providers for the **{api_name}** API.")
    md_lines.append("")

    md_lines.append("## Providers")

@ -373,9 +336,8 @@ def generate_index_docs(
        provider_name = entry["display_name"]
        filename = entry["filename"]
        md_lines.append(f"- [{provider_name}](./{filename})")

    return "\n".join(md_lines) + "\n"


def process_provider_registry(progress, change_tracker: ChangedPathTracker) -> None:

@ -411,14 +373,10 @@ def process_provider_registry(progress, change_tracker: ChangedPathTracker) -> None:
            else:
                display_name = display_name.title()

            provider_entries.append({"filename": filename, "display_name": display_name})

        # Generate index file with frontmatter
        index_content = generate_index_docs(api_name, api_docstring, provider_entries)
        index_file = doc_output_dir / "index.mdx"
        index_file.write_text(index_content)
        change_tracker.add_paths(index_file)