Mirror of https://github.com/meta-llama/llama-stack.git (synced 2025-10-03 19:57:35 +00:00)

provider codegen fixes

parent 29d84570c3
commit 04bf9e6f80

80 changed files with 1875 additions and 433 deletions

@@ -1,7 +1,13 @@
---
description: Available providers for the agents API
sidebar_label: Overview
sidebar_position: 1
description: "Agents API for creating and interacting with agentic systems.

Main functionalities provided by this API:
- Create agents with specific instructions and ability to use tools.
- Interactions with agents are grouped into sessions (\"threads\"), and each interaction is called a \"turn\".
- Agents can be provided with various tools (see the ToolGroups and ToolRuntime APIs for more details).
- Agents can be provided with various shields (see the Safety API for more details).
- Agents can also use Memory to retrieve information from knowledge bases. See the RAG Tool and Vector IO APIs for more details."
sidebar_label: Agents
title: Agents
---

@@ -22,4 +28,4 @@ This section contains documentation for all available providers for the **agents

## Providers

- **[Meta Reference](./inline_meta-reference)** - Inline provider
- [Meta-Reference](./inline_meta-reference)

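For orientation, the Agents description above (agents, sessions/"threads", turns, tools, shields) maps roughly onto the client SDK as in the hedged sketch below. This is editorial illustration, not part of the diff: it assumes the `llama-stack-client` Python SDK's `Agent` helper, a local server URL, a model id, and a toolgroup name; import paths and signatures vary between SDK versions.

```python
# Hedged sketch: create an agent, open a session ("thread"), and run one turn.
# Assumes the llama-stack-client SDK; adjust imports/signatures to your version.
from llama_stack_client import LlamaStackClient
from llama_stack_client.lib.agents.agent import Agent

client = LlamaStackClient(base_url="http://localhost:8321")  # assumed server URL

agent = Agent(
    client,
    model="meta-llama/Llama-3.1-8B-Instruct",  # assumed model id
    instructions="You are a helpful assistant.",
    tools=["builtin::websearch"],  # optional toolgroup from the ToolGroups/ToolRuntime APIs (assumed name)
)

session_id = agent.create_session("demo-session")  # a session groups turns
turn = agent.create_turn(
    session_id=session_id,
    messages=[{"role": "user", "content": "What providers does Llama Stack support?"}],
    stream=False,
)
print(turn.output_message.content)  # attribute name assumed from the SDK's Turn object
```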
@@ -1,8 +1,6 @@
---
description: Meta's reference implementation of an agent system that can use tools,
access vector databases, and perform complex reasoning tasks
sidebar_label: Meta Reference
sidebar_position: 2
description: "Meta's reference implementation of an agent system that can use tools, access vector databases, and perform complex reasoning tasks."
sidebar_label: Meta-Reference
title: inline::meta-reference
---

@@ -1,7 +1,15 @@
---
description: Available providers for the batches API
sidebar_label: Overview
sidebar_position: 1
description: "The Batches API enables efficient processing of multiple requests in a single operation,
particularly useful for processing large datasets, batch evaluation workflows, and
cost-effective inference at scale.

The API is designed to allow use of openai client libraries for seamless integration.

This API provides the following extensions:
 - idempotent batch creation

Note: This API is currently under active development and may undergo changes."
sidebar_label: Batches
title: Batches
---

@@ -24,4 +32,4 @@ This section contains documentation for all available providers for the **batche

## Providers

- **[Reference](./inline_reference)** - Inline provider
- [Reference](./inline_reference)

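Since the Batches API description says it is designed to work with OpenAI client libraries, a minimal usage sketch might look like the following. It is an illustration only: the base URL, API key placeholder, and the input file name are assumptions, not taken from this diff.

```python
# Hedged sketch: submit a batch of chat-completion requests through the
# OpenAI-compatible endpoint the Batches API description refers to.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8321/v1/openai/v1", api_key="none")  # assumed URL/key

# batch_input.jsonl holds one request per line in the OpenAI batch format (assumed to exist).
input_file = client.files.create(file=open("batch_input.jsonl", "rb"), purpose="batch")

batch = client.batches.create(
    input_file_id=input_file.id,
    endpoint="/v1/chat/completions",
    completion_window="24h",
)

# Later: poll the batch and fetch results from batch.output_file_id once completed.
status = client.batches.retrieve(batch.id).status
print(batch.id, status)
```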
@@ -1,7 +1,6 @@
---
description: Reference implementation of batches API with KVStore persistence
description: "Reference implementation of batches API with KVStore persistence."
sidebar_label: Reference
sidebar_position: 2
title: inline::reference
---

@@ -1,7 +1,5 @@
---
description: Available providers for the datasetio API
sidebar_label: Overview
sidebar_position: 1
sidebar_label: Datasetio
title: Datasetio
---

@@ -13,6 +11,6 @@ This section contains documentation for all available providers for the **datase

## Providers

- **[Localfs](./inline_localfs)** - Inline provider
- **[Huggingface](./remote_huggingface)** - Remote provider
- **[Nvidia](./remote_nvidia)** - Remote provider
- [Localfs](./inline_localfs)
- [Remote - Huggingface](./remote_huggingface)
- [Remote - Nvidia](./remote_nvidia)

@@ -1,8 +1,6 @@
---
description: Local filesystem-based dataset I/O provider for reading and writing datasets
to local storage
description: "Local filesystem-based dataset I/O provider for reading and writing datasets to local storage."
sidebar_label: Localfs
sidebar_position: 2
title: inline::localfs
---

@@ -1,8 +1,6 @@
---
description: HuggingFace datasets provider for accessing and managing datasets from
the HuggingFace Hub
sidebar_label: Huggingface
sidebar_position: 3
description: "HuggingFace datasets provider for accessing and managing datasets from the HuggingFace Hub."
sidebar_label: Remote - Huggingface
title: remote::huggingface
---

@@ -1,8 +1,6 @@
---
description: NVIDIA's dataset I/O provider for accessing datasets from NVIDIA's data
platform
sidebar_label: Nvidia
sidebar_position: 4
description: "NVIDIA's dataset I/O provider for accessing datasets from NVIDIA's data platform."
sidebar_label: Remote - Nvidia
title: remote::nvidia
---

@@ -1,7 +1,6 @@
---
description: Available providers for the eval API
sidebar_label: Overview
sidebar_position: 1
description: "Llama Stack Evaluation API for running evaluations on model and agent candidates."
sidebar_label: Eval
title: Eval
---

@@ -15,5 +14,5 @@ This section contains documentation for all available providers for the **eval**

## Providers

- **[Meta Reference](./inline_meta-reference)** - Inline provider
- **[Nvidia](./remote_nvidia)** - Remote provider
- [Meta-Reference](./inline_meta-reference)
- [Remote - Nvidia](./remote_nvidia)

@@ -1,8 +1,6 @@
---
description: Meta's reference implementation of evaluation tasks with support for
multiple languages and evaluation metrics
sidebar_label: Meta Reference
sidebar_position: 2
description: "Meta's reference implementation of evaluation tasks with support for multiple languages and evaluation metrics."
sidebar_label: Meta-Reference
title: inline::meta-reference
---

@@ -1,8 +1,6 @@
---
description: NVIDIA's evaluation provider for running evaluation tasks on NVIDIA's
platform
sidebar_label: Nvidia
sidebar_position: 3
description: "NVIDIA's evaluation provider for running evaluation tasks on NVIDIA's platform."
sidebar_label: Remote - Nvidia
title: remote::nvidia
---

@@ -1,7 +1,5 @@
---
description: Available providers for the files API
sidebar_label: Overview
sidebar_position: 1
sidebar_label: Files
title: Files
---

@@ -13,5 +11,5 @@ This section contains documentation for all available providers for the **files*

## Providers

- **[Localfs](./inline_localfs)** - Inline provider
- **[S3](./remote_s3)** - Remote provider
- [Localfs](./inline_localfs)
- [Remote - S3](./remote_s3)

@@ -1,8 +1,6 @@
---
description: Local filesystem-based file storage provider for managing files and documents
locally
description: "Local filesystem-based file storage provider for managing files and documents locally."
sidebar_label: Localfs
sidebar_position: 2
title: inline::localfs
---

@@ -1,8 +1,6 @@
---
description: AWS S3-based file storage provider for scalable cloud file management
with metadata persistence
sidebar_label: S3
sidebar_position: 3
description: "AWS S3-based file storage provider for scalable cloud file management with metadata persistence."
sidebar_label: Remote - S3
title: remote::s3
---

@@ -1,7 +1,10 @@
---
description: Available providers for the inference API
sidebar_label: Overview
sidebar_position: 1
description: "Llama Stack Inference API for generating completions, chat completions, and embeddings.

This API provides the raw interface to the underlying models. Two kinds of models are supported:
- LLM models: these models generate \"raw\" and \"chat\" (conversational) completions.
- Embedding models: these models generate embeddings to be used for semantic search."
sidebar_label: Inference
title: Inference
---

@@ -19,28 +22,27 @@ This section contains documentation for all available providers for the **infere

## Providers

- **[Meta Reference](./inline_meta-reference)** - Inline provider
- **[Sentence Transformers](./inline_sentence-transformers)** - Inline provider
- **[Anthropic](./remote_anthropic)** - Remote provider
- **[Azure](./remote_azure)** - Remote provider
- **[Bedrock](./remote_bedrock)** - Remote provider
- **[Cerebras](./remote_cerebras)** - Remote provider
- **[Databricks](./remote_databricks)** - Remote provider
- **[Fireworks](./remote_fireworks)** - Remote provider
- **[Gemini](./remote_gemini)** - Remote provider
- **[Groq](./remote_groq)** - Remote provider
- **[Hugging Face Endpoint](./remote_hf_endpoint)** - Remote provider
- **[Hugging Face Serverless](./remote_hf_serverless)** - Remote provider
- **[Llama OpenAI Compatible](./remote_llama-openai-compat)** - Remote provider
- **[Nvidia](./remote_nvidia)** - Remote provider
- **[Ollama](./remote_ollama)** - Remote provider
- **[Openai](./remote_openai)** - Remote provider
- **[Passthrough](./remote_passthrough)** - Remote provider
- **[Runpod](./remote_runpod)** - Remote provider
- **[Sambanova](./remote_sambanova)** - Remote provider
- **[SambaNova OpenAI Compatible](./remote_sambanova-openai-compat)** - Remote provider
- **[Tgi](./remote_tgi)** - Remote provider
- **[Together](./remote_together)** - Remote provider
- **[Vertexai](./remote_vertexai)** - Remote provider
- **[Vllm](./remote_vllm)** - Remote provider
- **[Watsonx](./remote_watsonx)** - Remote provider
- [Meta-Reference](./inline_meta-reference)
- [Sentence-Transformers](./inline_sentence-transformers)
- [Remote - Anthropic](./remote_anthropic)
- [Remote - Azure](./remote_azure)
- [Remote - Bedrock](./remote_bedrock)
- [Remote - Cerebras](./remote_cerebras)
- [Remote - Databricks](./remote_databricks)
- [Remote - Fireworks](./remote_fireworks)
- [Remote - Gemini](./remote_gemini)
- [Remote - Groq](./remote_groq)
- [Remote - Hf - Endpoint](./remote_hf_endpoint)
- [Remote - Hf - Serverless](./remote_hf_serverless)
- [Remote - Llama-Openai-Compat](./remote_llama-openai-compat)
- [Remote - Nvidia](./remote_nvidia)
- [Remote - Ollama](./remote_ollama)
- [Remote - Openai](./remote_openai)
- [Remote - Passthrough](./remote_passthrough)
- [Remote - Runpod](./remote_runpod)
- [Remote - Sambanova](./remote_sambanova)
- [Remote - Tgi](./remote_tgi)
- [Remote - Together](./remote_together)
- [Remote - Vertexai](./remote_vertexai)
- [Remote - Vllm](./remote_vllm)
- [Remote - Watsonx](./remote_watsonx)

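The Inference description distinguishes LLM models (raw/chat completions) from embedding models (semantic search). A hedged sketch of both calls through an OpenAI-compatible client follows; the base URL and model ids are assumptions chosen for illustration, not taken from this diff.

```python
# Hedged sketch: one chat completion and one embedding request against a
# Llama Stack server's OpenAI-compatible inference endpoint (URL and models assumed).
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8321/v1/openai/v1", api_key="none")

chat = client.chat.completions.create(
    model="meta-llama/Llama-3.1-8B-Instruct",  # an LLM model: "raw"/"chat" completions
    messages=[{"role": "user", "content": "Say hello in one sentence."}],
)
print(chat.choices[0].message.content)

emb = client.embeddings.create(
    model="sentence-transformers/all-MiniLM-L6-v2",  # an embedding model for semantic search
    input=["Llama Stack providers", "inference API"],
)
print(len(emb.data), len(emb.data[0].embedding))
```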
@ -1,8 +1,6 @@
|
|||
---
|
||||
description: Meta's reference implementation of inference with support for various
|
||||
model formats and optimization techniques
|
||||
sidebar_label: Meta Reference
|
||||
sidebar_position: 2
|
||||
description: "Meta's reference implementation of inference with support for various model formats and optimization techniques."
|
||||
sidebar_label: Meta-Reference
|
||||
title: inline::meta-reference
|
||||
---
|
||||
|
||||
|
|
|
@ -1,8 +1,6 @@
|
|||
---
|
||||
description: Sentence Transformers inference provider for text embeddings and similarity
|
||||
search
|
||||
sidebar_label: Sentence Transformers
|
||||
sidebar_position: 3
|
||||
description: "Sentence Transformers inference provider for text embeddings and similarity search."
|
||||
sidebar_label: Sentence-Transformers
|
||||
title: inline::sentence-transformers
|
||||
---
|
||||
|
||||
|
@ -12,10 +10,6 @@ title: inline::sentence-transformers
|
|||
|
||||
Sentence Transformers inference provider for text embeddings and similarity search.
|
||||
|
||||
## Configuration
|
||||
|
||||
No configuration options available.
|
||||
|
||||
## Sample Configuration
|
||||
|
||||
```yaml
|
||||
|
|
|
@ -1,8 +1,6 @@
|
|||
---
|
||||
description: Anthropic inference provider for accessing Claude models and Anthropic's
|
||||
AI services
|
||||
sidebar_label: Anthropic
|
||||
sidebar_position: 4
|
||||
description: "Anthropic inference provider for accessing Claude models and Anthropic's AI services."
|
||||
sidebar_label: Remote - Anthropic
|
||||
title: remote::anthropic
|
||||
---
|
||||
|
||||
|
|
|
@ -1,8 +1,9 @@
|
|||
---
|
||||
description: Azure OpenAI inference provider for accessing GPT models and other Azure
|
||||
services
|
||||
sidebar_label: Azure
|
||||
sidebar_position: 5
|
||||
description: |
|
||||
Azure OpenAI inference provider for accessing GPT models and other Azure services.
|
||||
Provider documentation
|
||||
https://learn.microsoft.com/en-us/azure/ai-foundry/openai/overview
|
||||
sidebar_label: Remote - Azure
|
||||
title: remote::azure
|
||||
---
|
||||
|
||||
|
@ -10,10 +11,12 @@ title: remote::azure
|
|||
|
||||
## Description
|
||||
|
||||
|
||||
Azure OpenAI inference provider for accessing GPT models and other Azure services.
|
||||
Provider documentation
|
||||
https://learn.microsoft.com/en-us/azure/ai-foundry/openai/overview
|
||||
|
||||
|
||||
## Configuration
|
||||
|
||||
| Field | Type | Required | Default | Description |
|
||||
|
|
|
@ -1,8 +1,6 @@
|
|||
---
|
||||
description: AWS Bedrock inference provider for accessing various AI models through
|
||||
AWS's managed service
|
||||
sidebar_label: Bedrock
|
||||
sidebar_position: 6
|
||||
description: "AWS Bedrock inference provider for accessing various AI models through AWS's managed service."
|
||||
sidebar_label: Remote - Bedrock
|
||||
title: remote::bedrock
|
||||
---
|
||||
|
||||
|
|
|
@ -1,7 +1,6 @@
|
|||
---
|
||||
description: Cerebras inference provider for running models on Cerebras Cloud platform
|
||||
sidebar_label: Cerebras
|
||||
sidebar_position: 7
|
||||
description: "Cerebras inference provider for running models on Cerebras Cloud platform."
|
||||
sidebar_label: Remote - Cerebras
|
||||
title: remote::cerebras
|
||||
---
|
||||
|
||||
|
|
|
@ -1,8 +1,6 @@
|
|||
---
|
||||
description: Databricks inference provider for running models on Databricks' unified
|
||||
analytics platform
|
||||
sidebar_label: Databricks
|
||||
sidebar_position: 8
|
||||
description: "Databricks inference provider for running models on Databricks' unified analytics platform."
|
||||
sidebar_label: Remote - Databricks
|
||||
title: remote::databricks
|
||||
---
|
||||
|
||||
|
|
|
@ -1,8 +1,6 @@
|
|||
---
|
||||
description: Fireworks AI inference provider for Llama models and other AI models
|
||||
on the Fireworks platform
|
||||
sidebar_label: Fireworks
|
||||
sidebar_position: 9
|
||||
description: "Fireworks AI inference provider for Llama models and other AI models on the Fireworks platform."
|
||||
sidebar_label: Remote - Fireworks
|
||||
title: remote::fireworks
|
||||
---
|
||||
|
||||
|
|
|
@ -1,8 +1,6 @@
|
|||
---
|
||||
description: Google Gemini inference provider for accessing Gemini models and Google's
|
||||
AI services
|
||||
sidebar_label: Gemini
|
||||
sidebar_position: 10
|
||||
description: "Google Gemini inference provider for accessing Gemini models and Google's AI services."
|
||||
sidebar_label: Remote - Gemini
|
||||
title: remote::gemini
|
||||
---
|
||||
|
||||
|
|
|
@ -1,7 +1,6 @@
|
|||
---
|
||||
description: Groq inference provider for ultra-fast inference using Groq's LPU technology
|
||||
sidebar_label: Groq
|
||||
sidebar_position: 11
|
||||
description: "Groq inference provider for ultra-fast inference using Groq's LPU technology."
|
||||
sidebar_label: Remote - Groq
|
||||
title: remote::groq
|
||||
---
|
||||
|
||||
|
|
|
@ -1,7 +1,6 @@
|
|||
---
|
||||
description: HuggingFace Inference Endpoints provider for dedicated model serving
|
||||
sidebar_label: Hugging Face Endpoint
|
||||
sidebar_position: 12
|
||||
description: "HuggingFace Inference Endpoints provider for dedicated model serving."
|
||||
sidebar_label: Remote - Hf - Endpoint
|
||||
title: remote::hf::endpoint
|
||||
---
|
||||
|
||||
|
@ -15,8 +14,8 @@ HuggingFace Inference Endpoints provider for dedicated model serving.
|
|||
|
||||
| Field | Type | Required | Default | Description |
|
||||
|-------|------|----------|---------|-------------|
|
||||
| `endpoint_name` | `<class 'str'>` | No | | The name of the Hugging Face Inference Endpoint in the format of `{namespace}/{endpoint_name}` (e.g. 'my-cool-org/meta-llama-3-1-8b-instruct-rce'). Namespace is optional and will default to the user account if not provided. |
|
||||
| `api_token` | `pydantic.types.SecretStr or None` | No | | Your Hugging Face user access token (will default to locally saved token if not provided) |
|
||||
| `endpoint_name` | `<class 'str'>` | No | | The name of the Hugging Face Inference Endpoint in the format of '{namespace}/{endpoint_name}' (e.g. 'my-cool-org/meta-llama-3-1-8b-instruct-rce'). Namespace is optional and will default to the user account if not provided. |
|
||||
| `api_token` | `pydantic.types.SecretStr \| None` | No | | Your Hugging Face user access token (will default to locally saved token if not provided) |
|
||||
|
||||
## Sample Configuration
|
||||
|
||||
|
|
|
@ -1,7 +1,6 @@
|
|||
---
|
||||
description: HuggingFace Inference API serverless provider for on-demand model inference
|
||||
sidebar_label: Hugging Face Serverless
|
||||
sidebar_position: 13
|
||||
description: "HuggingFace Inference API serverless provider for on-demand model inference."
|
||||
sidebar_label: Remote - Hf - Serverless
|
||||
title: remote::hf::serverless
|
||||
---
|
||||
|
||||
|
|
|
@ -1,8 +1,6 @@
|
|||
---
|
||||
description: Llama OpenAI-compatible provider for using Llama models with OpenAI API
|
||||
format
|
||||
sidebar_label: Llama OpenAI Compatible
|
||||
sidebar_position: 14
|
||||
description: "Llama OpenAI-compatible provider for using Llama models with OpenAI API format."
|
||||
sidebar_label: Remote - Llama-Openai-Compat
|
||||
title: remote::llama-openai-compat
|
||||
---
|
||||
|
||||
|
|
|
@ -1,7 +1,6 @@
|
|||
---
|
||||
description: NVIDIA inference provider for accessing NVIDIA NIM models and AI services
|
||||
sidebar_label: Nvidia
|
||||
sidebar_position: 15
|
||||
description: "NVIDIA inference provider for accessing NVIDIA NIM models and AI services."
|
||||
sidebar_label: Remote - Nvidia
|
||||
title: remote::nvidia
|
||||
---
|
||||
|
||||
|
|
|
@ -1,8 +1,6 @@
|
|||
---
|
||||
description: Ollama inference provider for running local models through the Ollama
|
||||
runtime
|
||||
sidebar_label: Ollama
|
||||
sidebar_position: 16
|
||||
description: "Ollama inference provider for running local models through the Ollama runtime."
|
||||
sidebar_label: Remote - Ollama
|
||||
title: remote::ollama
|
||||
---
|
||||
|
||||
|
|
|
@ -1,7 +1,6 @@
|
|||
---
|
||||
description: OpenAI inference provider for accessing GPT models and other OpenAI services
|
||||
sidebar_label: Openai
|
||||
sidebar_position: 17
|
||||
description: "OpenAI inference provider for accessing GPT models and other OpenAI services."
|
||||
sidebar_label: Remote - Openai
|
||||
title: remote::openai
|
||||
---
|
||||
|
||||
|
|
|
@ -1,8 +1,6 @@
|
|||
---
|
||||
description: Passthrough inference provider for connecting to any external inference
|
||||
service not directly supported
|
||||
sidebar_label: Passthrough
|
||||
sidebar_position: 18
|
||||
description: "Passthrough inference provider for connecting to any external inference service not directly supported."
|
||||
sidebar_label: Remote - Passthrough
|
||||
title: remote::passthrough
|
||||
---
|
||||
|
||||
|
|
|
@ -1,7 +1,6 @@
|
|||
---
|
||||
description: RunPod inference provider for running models on RunPod's cloud GPU platform
|
||||
sidebar_label: Runpod
|
||||
sidebar_position: 19
|
||||
description: "RunPod inference provider for running models on RunPod's cloud GPU platform."
|
||||
sidebar_label: Remote - Runpod
|
||||
title: remote::runpod
|
||||
---
|
||||
|
||||
|
|
|
@ -1,8 +1,6 @@
|
|||
---
|
||||
description: SambaNova inference provider for running models on SambaNova's dataflow
|
||||
architecture
|
||||
sidebar_label: Sambanova
|
||||
sidebar_position: 20
|
||||
description: "SambaNova inference provider for running models on SambaNova's dataflow architecture."
|
||||
sidebar_label: Remote - Sambanova
|
||||
title: remote::sambanova
|
||||
---
|
||||
|
||||
|
|
|
@ -1,7 +1,6 @@
|
|||
---
|
||||
description: Text Generation Inference (TGI) provider for HuggingFace model serving
|
||||
sidebar_label: Tgi
|
||||
sidebar_position: 22
|
||||
description: "Text Generation Inference (TGI) provider for HuggingFace model serving."
|
||||
sidebar_label: Remote - Tgi
|
||||
title: remote::tgi
|
||||
---
|
||||
|
||||
|
|
|
@ -1,8 +1,6 @@
|
|||
---
|
||||
description: Together AI inference provider for open-source models and collaborative
|
||||
AI development
|
||||
sidebar_label: Together
|
||||
sidebar_position: 23
|
||||
description: "Together AI inference provider for open-source models and collaborative AI development."
|
||||
sidebar_label: Remote - Together
|
||||
title: remote::together
|
||||
---
|
||||
|
||||
|
|
|
@@ -1,18 +1,26 @@
---
description: "Google Vertex AI inference provider enables you to use Google's Gemini\
\ models through Google Cloud's Vertex AI platform, providing several advantages:\n\
\n\u2022 Enterprise-grade security: Uses Google Cloud's security controls and IAM\n\
\u2022 Better integration: Seamless integration with other Google Cloud services\n\
\u2022 Advanced features: Access to additional Vertex AI features like model tuning\
\ and monitoring\n\u2022 Authentication: Uses Google Cloud Application Default Credentials\
\ (ADC) instead of API keys\n\nConfiguration:\n- Set VERTEX_AI_PROJECT environment\
\ variable (required)\n- Set VERTEX_AI_LOCATION environment variable (optional,\
\ defaults to us-central1)\n- Use Google Cloud Application Default Credentials or\
\ service account key\n\nAuthentication Setup:\nOption 1 (Recommended): gcloud auth\
\ application-default login\nOption 2: Set GOOGLE_APPLICATION_CREDENTIALS to service\
\ account key path\n\nAvailable Models:\n- vertex_ai/gemini-2"
sidebar_label: Vertexai
sidebar_position: 24
description: |
Google Vertex AI inference provider enables you to use Google's Gemini models through Google Cloud's Vertex AI platform, providing several advantages:

• Enterprise-grade security: Uses Google Cloud's security controls and IAM
• Better integration: Seamless integration with other Google Cloud services
• Advanced features: Access to additional Vertex AI features like model tuning and monitoring
• Authentication: Uses Google Cloud Application Default Credentials (ADC) instead of API keys

Configuration:
- Set VERTEX_AI_PROJECT environment variable (required)
- Set VERTEX_AI_LOCATION environment variable (optional, defaults to us-central1)
- Use Google Cloud Application Default Credentials or service account key

Authentication Setup:
Option 1 (Recommended): gcloud auth application-default login
Option 2: Set GOOGLE_APPLICATION_CREDENTIALS to service account key path

Available Models:
- vertex_ai/gemini-2.0-flash
- vertex_ai/gemini-2.5-flash
- vertex_ai/gemini-2.5-pro
sidebar_label: Remote - Vertexai
title: remote::vertexai
---

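As a quick illustration of the configuration contract described in the Vertex AI frontmatter above (VERTEX_AI_PROJECT required, VERTEX_AI_LOCATION optional with a us-central1 default, credentials via ADC or a service account key), here is a small hedged Python sketch; it only restates the documented environment variables, nothing provider-specific is invoked.

```python
# Hedged sketch: mirror the Vertex AI provider's documented environment contract.
import os

project = os.environ["VERTEX_AI_PROJECT"]  # required per the provider description
location = os.environ.get("VERTEX_AI_LOCATION", "us-central1")  # optional, documented default

# Credentials come from Application Default Credentials (gcloud auth application-default login)
# or from a service account key pointed to by GOOGLE_APPLICATION_CREDENTIALS.
creds_path = os.environ.get("GOOGLE_APPLICATION_CREDENTIALS")
print(f"project={project} location={location} credentials={creds_path or 'ADC'}")
```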
@ -1,7 +1,6 @@
|
|||
---
|
||||
description: Remote vLLM inference provider for connecting to vLLM servers
|
||||
sidebar_label: Vllm
|
||||
sidebar_position: 25
|
||||
description: "Remote vLLM inference provider for connecting to vLLM servers."
|
||||
sidebar_label: Remote - Vllm
|
||||
title: remote::vllm
|
||||
---
|
||||
|
||||
|
|
|
@@ -1,8 +1,6 @@
---
description: IBM WatsonX inference provider for accessing AI models on IBM's WatsonX
platform
sidebar_label: Watsonx
sidebar_position: 26
description: "IBM WatsonX inference provider for accessing AI models on IBM's WatsonX platform."
sidebar_label: Remote - Watsonx
title: remote::watsonx
---

@@ -17,8 +15,8 @@ IBM WatsonX inference provider for accessing AI models on IBM's WatsonX platform

| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `url` | `<class 'str'>` | No | https://us-south.ml.cloud.ibm.com | A base url for accessing the watsonx.ai |
| `api_key` | `pydantic.types.SecretStr \| None` | No | | The watsonx API key, only needed of using the hosted service |
| `project_id` | `str \| None` | No | | The Project ID key, only needed of using the hosted service |
| `api_key` | `pydantic.types.SecretStr \| None` | No | | The watsonx API key |
| `project_id` | `str \| None` | No | | The Project ID key |
| `timeout` | `<class 'int'>` | No | 60 | Timeout for the HTTP requests |

## Sample Configuration

@@ -1,7 +1,5 @@
---
description: Available providers for the post_training API
sidebar_label: Overview
sidebar_position: 1
sidebar_label: Post Training
title: Post_Training
---

@@ -13,10 +11,7 @@ This section contains documentation for all available providers for the **post_t

## Providers

- **[Huggingface](./inline_huggingface)** - Inline provider
- **[Huggingface Cpu](./inline_huggingface-cpu)** - Inline provider
- **[Huggingface Gpu](./inline_huggingface-gpu)** - Inline provider
- **[Torchtune](./inline_torchtune)** - Inline provider
- **[Torchtune Cpu](./inline_torchtune-cpu)** - Inline provider
- **[Torchtune Gpu](./inline_torchtune-gpu)** - Inline provider
- **[Nvidia](./remote_nvidia)** - Remote provider
- [Huggingface-Gpu](./inline_huggingface-gpu)
- [Torchtune-Cpu](./inline_torchtune-cpu)
- [Torchtune-Gpu](./inline_torchtune-gpu)
- [Remote - Nvidia](./remote_nvidia)

@@ -1,8 +1,6 @@
---
description: HuggingFace-based post-training provider for fine-tuning models using
the HuggingFace ecosystem
sidebar_label: Huggingface Gpu
sidebar_position: 4
description: "HuggingFace-based post-training provider for fine-tuning models using the HuggingFace ecosystem."
sidebar_label: Huggingface-Gpu
title: inline::huggingface-gpu
---

@@ -17,10 +15,13 @@ HuggingFace-based post-training provider for fine-tuning models using the Huggin

| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `device` | `<class 'str'>` | No | cuda | |
| `distributed_backend` | `Literal['fsdp', 'deepspeed']` | No | | |
| `checkpoint_format` | `Literal['full_state', 'huggingface']` | No | huggingface | |
| `chat_template` | `<class 'str'>` | No | `<\|user\|>{input}<\|assistant\|>{output}` | |
| `model_specific_config` | `<class 'dict'>` | No | `{'trust_remote_code': True, 'attn_implementation': 'sdpa'}` | |
| `distributed_backend` | `Literal['fsdp', 'deepspeed'` | No | | |
| `checkpoint_format` | `Literal['full_state', 'huggingface'` | No | huggingface | |
| `chat_template` | `<class 'str'>` | No | <|user|>
{input}
<|assistant|>
{output} | |
| `model_specific_config` | `<class 'dict'>` | No | {'trust_remote_code': True, 'attn_implementation': 'sdpa'} | |
| `max_seq_length` | `<class 'int'>` | No | 2048 | |
| `gradient_checkpointing` | `<class 'bool'>` | No | False | |
| `save_total_limit` | `<class 'int'>` | No | 3 | |

@ -1,8 +1,6 @@
|
|||
---
|
||||
description: TorchTune-based post-training provider for fine-tuning and optimizing
|
||||
models using Meta's TorchTune framework
|
||||
sidebar_label: Torchtune Cpu
|
||||
sidebar_position: 6
|
||||
description: "TorchTune-based post-training provider for fine-tuning and optimizing models using Meta's TorchTune framework."
|
||||
sidebar_label: Torchtune-Cpu
|
||||
title: inline::torchtune-cpu
|
||||
---
|
||||
|
||||
|
|
|
@ -1,8 +1,6 @@
|
|||
---
|
||||
description: TorchTune-based post-training provider for fine-tuning and optimizing
|
||||
models using Meta's TorchTune framework
|
||||
sidebar_label: Torchtune Gpu
|
||||
sidebar_position: 7
|
||||
description: "TorchTune-based post-training provider for fine-tuning and optimizing models using Meta's TorchTune framework."
|
||||
sidebar_label: Torchtune-Gpu
|
||||
title: inline::torchtune-gpu
|
||||
---
|
||||
|
||||
|
|
|
@ -1,7 +1,6 @@
|
|||
---
|
||||
description: NVIDIA's post-training provider for fine-tuning models on NVIDIA's platform
|
||||
sidebar_label: Nvidia
|
||||
sidebar_position: 8
|
||||
description: "NVIDIA's post-training provider for fine-tuning models on NVIDIA's platform."
|
||||
sidebar_label: Remote - Nvidia
|
||||
title: remote::nvidia
|
||||
---
|
||||
|
||||
|
|
|
@ -1,7 +1,5 @@
|
|||
---
|
||||
description: Available providers for the safety API
|
||||
sidebar_label: Overview
|
||||
sidebar_position: 1
|
||||
sidebar_label: Safety
|
||||
title: Safety
|
||||
---
|
||||
|
||||
|
@ -13,9 +11,9 @@ This section contains documentation for all available providers for the **safety
|
|||
|
||||
## Providers
|
||||
|
||||
- **[Code Scanner](./inline_code-scanner)** - Inline provider
|
||||
- **[Llama Guard](./inline_llama-guard)** - Inline provider
|
||||
- **[Prompt Guard](./inline_prompt-guard)** - Inline provider
|
||||
- **[Bedrock](./remote_bedrock)** - Remote provider
|
||||
- **[Nvidia](./remote_nvidia)** - Remote provider
|
||||
- **[Sambanova](./remote_sambanova)** - Remote provider
|
||||
- [Code-Scanner](./inline_code-scanner)
|
||||
- [Llama-Guard](./inline_llama-guard)
|
||||
- [Prompt-Guard](./inline_prompt-guard)
|
||||
- [Remote - Bedrock](./remote_bedrock)
|
||||
- [Remote - Nvidia](./remote_nvidia)
|
||||
- [Remote - Sambanova](./remote_sambanova)
|
||||
|
|
|
@ -1,8 +1,6 @@
|
|||
---
|
||||
description: Code Scanner safety provider for detecting security vulnerabilities and
|
||||
unsafe code patterns
|
||||
sidebar_label: Code Scanner
|
||||
sidebar_position: 2
|
||||
description: "Code Scanner safety provider for detecting security vulnerabilities and unsafe code patterns."
|
||||
sidebar_label: Code-Scanner
|
||||
title: inline::code-scanner
|
||||
---
|
||||
|
||||
|
@ -12,10 +10,6 @@ title: inline::code-scanner
|
|||
|
||||
Code Scanner safety provider for detecting security vulnerabilities and unsafe code patterns.
|
||||
|
||||
## Configuration
|
||||
|
||||
No configuration options available.
|
||||
|
||||
## Sample Configuration
|
||||
|
||||
```yaml
|
||||
|
|
|
@ -1,8 +1,6 @@
|
|||
---
|
||||
description: Llama Guard safety provider for content moderation and safety filtering
|
||||
using Meta's Llama Guard model
|
||||
sidebar_label: Llama Guard
|
||||
sidebar_position: 3
|
||||
description: "Llama Guard safety provider for content moderation and safety filtering using Meta's Llama Guard model."
|
||||
sidebar_label: Llama-Guard
|
||||
title: inline::llama-guard
|
||||
---
|
||||
|
||||
|
|
|
@ -1,8 +1,6 @@
|
|||
---
|
||||
description: Prompt Guard safety provider for detecting and filtering unsafe prompts
|
||||
and content
|
||||
sidebar_label: Prompt Guard
|
||||
sidebar_position: 4
|
||||
description: "Prompt Guard safety provider for detecting and filtering unsafe prompts and content."
|
||||
sidebar_label: Prompt-Guard
|
||||
title: inline::prompt-guard
|
||||
---
|
||||
|
||||
|
|
|
@ -1,8 +1,6 @@
|
|||
---
|
||||
description: AWS Bedrock safety provider for content moderation using AWS's safety
|
||||
services
|
||||
sidebar_label: Bedrock
|
||||
sidebar_position: 5
|
||||
description: "AWS Bedrock safety provider for content moderation using AWS's safety services."
|
||||
sidebar_label: Remote - Bedrock
|
||||
title: remote::bedrock
|
||||
---
|
||||
|
||||
|
|
|
@ -1,7 +1,6 @@
|
|||
---
|
||||
description: NVIDIA's safety provider for content moderation and safety filtering
|
||||
sidebar_label: Nvidia
|
||||
sidebar_position: 6
|
||||
description: "NVIDIA's safety provider for content moderation and safety filtering."
|
||||
sidebar_label: Remote - Nvidia
|
||||
title: remote::nvidia
|
||||
---
|
||||
|
||||
|
|
|
@ -1,7 +1,6 @@
|
|||
---
|
||||
description: SambaNova's safety provider for content moderation and safety filtering
|
||||
sidebar_label: Sambanova
|
||||
sidebar_position: 7
|
||||
description: "SambaNova's safety provider for content moderation and safety filtering."
|
||||
sidebar_label: Remote - Sambanova
|
||||
title: remote::sambanova
|
||||
---
|
||||
|
||||
|
|
|
@ -1,7 +1,5 @@
|
|||
---
|
||||
description: Available providers for the scoring API
|
||||
sidebar_label: Overview
|
||||
sidebar_position: 1
|
||||
sidebar_label: Scoring
|
||||
title: Scoring
|
||||
---
|
||||
|
||||
|
@ -13,6 +11,6 @@ This section contains documentation for all available providers for the **scorin
|
|||
|
||||
## Providers
|
||||
|
||||
- **[Basic](./inline_basic)** - Inline provider
|
||||
- **[Braintrust](./inline_braintrust)** - Inline provider
|
||||
- **[Llm As Judge](./inline_llm-as-judge)** - Inline provider
|
||||
- [Basic](./inline_basic)
|
||||
- [Braintrust](./inline_braintrust)
|
||||
- [Llm-As-Judge](./inline_llm-as-judge)
|
||||
|
|
|
@ -1,7 +1,6 @@
|
|||
---
|
||||
description: Basic scoring provider for simple evaluation metrics and scoring functions
|
||||
description: "Basic scoring provider for simple evaluation metrics and scoring functions."
|
||||
sidebar_label: Basic
|
||||
sidebar_position: 2
|
||||
title: inline::basic
|
||||
---
|
||||
|
||||
|
@ -11,10 +10,6 @@ title: inline::basic
|
|||
|
||||
Basic scoring provider for simple evaluation metrics and scoring functions.
|
||||
|
||||
## Configuration
|
||||
|
||||
No configuration options available.
|
||||
|
||||
## Sample Configuration
|
||||
|
||||
```yaml
|
||||
|
|
|
@ -1,8 +1,6 @@
|
|||
---
|
||||
description: Braintrust scoring provider for evaluation and scoring using the Braintrust
|
||||
platform
|
||||
description: "Braintrust scoring provider for evaluation and scoring using the Braintrust platform."
|
||||
sidebar_label: Braintrust
|
||||
sidebar_position: 3
|
||||
title: inline::braintrust
|
||||
---
|
||||
|
||||
|
|
|
@ -1,8 +1,6 @@
|
|||
---
|
||||
description: LLM-as-judge scoring provider that uses language models to evaluate and
|
||||
score responses
|
||||
sidebar_label: Llm As Judge
|
||||
sidebar_position: 4
|
||||
description: "LLM-as-judge scoring provider that uses language models to evaluate and score responses."
|
||||
sidebar_label: Llm-As-Judge
|
||||
title: inline::llm-as-judge
|
||||
---
|
||||
|
||||
|
@ -12,10 +10,6 @@ title: inline::llm-as-judge
|
|||
|
||||
LLM-as-judge scoring provider that uses language models to evaluate and score responses.
|
||||
|
||||
## Configuration
|
||||
|
||||
No configuration options available.
|
||||
|
||||
## Sample Configuration
|
||||
|
||||
```yaml
|
||||
|
|
|
@ -1,7 +1,5 @@
|
|||
---
|
||||
description: Available providers for the telemetry API
|
||||
sidebar_label: Overview
|
||||
sidebar_position: 1
|
||||
sidebar_label: Telemetry
|
||||
title: Telemetry
|
||||
---
|
||||
|
||||
|
@ -13,4 +11,4 @@ This section contains documentation for all available providers for the **teleme
|
|||
|
||||
## Providers
|
||||
|
||||
- **[Meta Reference](./inline_meta-reference)** - Inline provider
|
||||
- [Meta-Reference](./inline_meta-reference)
|
||||
|
|
|
@@ -1,8 +1,6 @@
---
description: Meta's reference implementation of telemetry and observability using
OpenTelemetry
sidebar_label: Meta Reference
sidebar_position: 2
description: "Meta's reference implementation of telemetry and observability using OpenTelemetry."
sidebar_label: Meta-Reference
title: inline::meta-reference
---

@@ -16,9 +14,9 @@ Meta's reference implementation of telemetry and observability using OpenTelemet

| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `otel_exporter_otlp_endpoint` | `str or None` | No | | The OpenTelemetry collector endpoint URL (base URL for traces, metrics, and logs). If not set, the SDK will use OTEL_EXPORTER_OTLP_ENDPOINT environment variable. |
| `otel_exporter_otlp_endpoint` | `str \| None` | No | | The OpenTelemetry collector endpoint URL (base URL for traces, metrics, and logs). If not set, the SDK will use OTEL_EXPORTER_OTLP_ENDPOINT environment variable. |
| `service_name` | `<class 'str'>` | No | | The service name to use for telemetry |
| `sinks` | `list[TelemetrySink]` | No | `[CONSOLE, SQLITE]` | List of telemetry sinks to enable (possible values: otel_trace, otel_metric, sqlite, console) |
| `sinks` | `list[inline.telemetry.meta_reference.config.TelemetrySink` | No | [<TelemetrySink.CONSOLE: 'console'>, <TelemetrySink.SQLITE: 'sqlite'>] | List of telemetry sinks to enable (possible values: otel_trace, otel_metric, sqlite, console) |
| `sqlite_db_path` | `<class 'str'>` | No | ~/.llama/runtime/trace_store.db | The path to the SQLite database to use for storing traces |

## Sample Configuration

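The `otel_exporter_otlp_endpoint` field description above says that when the field is unset, the OpenTelemetry SDK falls back to the `OTEL_EXPORTER_OTLP_ENDPOINT` environment variable. A tiny hedged sketch of that resolution order, written only to illustrate the documented behavior:

```python
# Hedged sketch: endpoint resolution order implied by the field description.
import os
from typing import Optional

def resolve_otlp_endpoint(configured: Optional[str]) -> Optional[str]:
    # Prefer the explicit otel_exporter_otlp_endpoint value; otherwise let the
    # OpenTelemetry SDK's standard environment variable take over.
    return configured or os.environ.get("OTEL_EXPORTER_OTLP_ENDPOINT")

print(resolve_otlp_endpoint(None))  # falls back to OTEL_EXPORTER_OTLP_ENDPOINT if set
```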
@ -1,7 +1,5 @@
|
|||
---
|
||||
description: Available providers for the tool_runtime API
|
||||
sidebar_label: Overview
|
||||
sidebar_position: 1
|
||||
sidebar_label: Tool Runtime
|
||||
title: Tool_Runtime
|
||||
---
|
||||
|
||||
|
@ -13,9 +11,9 @@ This section contains documentation for all available providers for the **tool_r
|
|||
|
||||
## Providers
|
||||
|
||||
- **[Rag Runtime](./inline_rag-runtime)** - Inline provider
|
||||
- **[Bing Search](./remote_bing-search)** - Remote provider
|
||||
- **[Brave Search](./remote_brave-search)** - Remote provider
|
||||
- **[Model Context Protocol](./remote_model-context-protocol)** - Remote provider
|
||||
- **[Tavily Search](./remote_tavily-search)** - Remote provider
|
||||
- **[Wolfram Alpha](./remote_wolfram-alpha)** - Remote provider
|
||||
- [Rag-Runtime](./inline_rag-runtime)
|
||||
- [Remote - Bing-Search](./remote_bing-search)
|
||||
- [Remote - Brave-Search](./remote_brave-search)
|
||||
- [Remote - Model-Context-Protocol](./remote_model-context-protocol)
|
||||
- [Remote - Tavily-Search](./remote_tavily-search)
|
||||
- [Remote - Wolfram-Alpha](./remote_wolfram-alpha)
|
||||
|
|
|
@ -1,8 +1,6 @@
|
|||
---
|
||||
description: RAG (Retrieval-Augmented Generation) tool runtime for document ingestion,
|
||||
chunking, and semantic search
|
||||
sidebar_label: Rag Runtime
|
||||
sidebar_position: 2
|
||||
description: "RAG (Retrieval-Augmented Generation) tool runtime for document ingestion, chunking, and semantic search."
|
||||
sidebar_label: Rag-Runtime
|
||||
title: inline::rag-runtime
|
||||
---
|
||||
|
||||
|
@ -12,10 +10,6 @@ title: inline::rag-runtime
|
|||
|
||||
RAG (Retrieval-Augmented Generation) tool runtime for document ingestion, chunking, and semantic search.
|
||||
|
||||
## Configuration
|
||||
|
||||
No configuration options available.
|
||||
|
||||
## Sample Configuration
|
||||
|
||||
```yaml
|
||||
|
|
|
@ -1,8 +1,6 @@
|
|||
---
|
||||
description: Bing Search tool for web search capabilities using Microsoft's search
|
||||
engine
|
||||
sidebar_label: Bing Search
|
||||
sidebar_position: 3
|
||||
description: "Bing Search tool for web search capabilities using Microsoft's search engine."
|
||||
sidebar_label: Remote - Bing-Search
|
||||
title: remote::bing-search
|
||||
---
|
||||
|
||||
|
|
|
@ -1,7 +1,6 @@
|
|||
---
|
||||
description: Brave Search tool for web search capabilities with privacy-focused results
|
||||
sidebar_label: Brave Search
|
||||
sidebar_position: 4
|
||||
description: "Brave Search tool for web search capabilities with privacy-focused results."
|
||||
sidebar_label: Remote - Brave-Search
|
||||
title: remote::brave-search
|
||||
---
|
||||
|
||||
|
|
|
@ -1,8 +1,6 @@
|
|||
---
|
||||
description: Model Context Protocol (MCP) tool for standardized tool calling and context
|
||||
management
|
||||
sidebar_label: Model Context Protocol
|
||||
sidebar_position: 5
|
||||
description: "Model Context Protocol (MCP) tool for standardized tool calling and context management."
|
||||
sidebar_label: Remote - Model-Context-Protocol
|
||||
title: remote::model-context-protocol
|
||||
---
|
||||
|
||||
|
@ -12,10 +10,6 @@ title: remote::model-context-protocol
|
|||
|
||||
Model Context Protocol (MCP) tool for standardized tool calling and context management.
|
||||
|
||||
## Configuration
|
||||
|
||||
No configuration options available.
|
||||
|
||||
## Sample Configuration
|
||||
|
||||
```yaml
|
||||
|
|
|
@ -1,7 +1,6 @@
|
|||
---
|
||||
description: Tavily Search tool for AI-optimized web search with structured results
|
||||
sidebar_label: Tavily Search
|
||||
sidebar_position: 6
|
||||
description: "Tavily Search tool for AI-optimized web search with structured results."
|
||||
sidebar_label: Remote - Tavily-Search
|
||||
title: remote::tavily-search
|
||||
---
|
||||
|
||||
|
|
|
@ -1,7 +1,6 @@
|
|||
---
|
||||
description: Wolfram Alpha tool for computational knowledge and mathematical calculations
|
||||
sidebar_label: Wolfram Alpha
|
||||
sidebar_position: 7
|
||||
description: "Wolfram Alpha tool for computational knowledge and mathematical calculations."
|
||||
sidebar_label: Remote - Wolfram-Alpha
|
||||
title: remote::wolfram-alpha
|
||||
---
|
||||
|
||||
|
|
|
@@ -1,7 +1,5 @@
---
description: Available providers for the vector_io API
sidebar_label: Overview
sidebar_position: 1
sidebar_label: Vector Io
title: Vector_Io
---

@@ -13,15 +11,15 @@ This section contains documentation for all available providers for the **vector

## Providers

- **[Chromadb](./inline_chromadb)** - Inline provider
- **[Faiss](./inline_faiss)** - Inline provider
- **[Meta Reference](./inline_meta-reference)** - Inline provider
- **[Milvus](./inline_milvus)** - Inline provider
- **[Qdrant](./inline_qdrant)** - Inline provider
- **[SQLite-Vec](./inline_sqlite-vec)** - Inline provider
- **[SQLite-Vec](./inline_sqlite_vec)** - Inline provider
- **[Chromadb](./remote_chromadb)** - Remote provider
- **[Milvus](./remote_milvus)** - Remote provider
- **[Pgvector](./remote_pgvector)** - Remote provider
- **[Qdrant](./remote_qdrant)** - Remote provider
- **[Weaviate](./remote_weaviate)** - Remote provider
- [Chromadb](./inline_chromadb)
- [Faiss](./inline_faiss)
- [Meta-Reference](./inline_meta-reference)
- [Milvus](./inline_milvus)
- [Qdrant](./inline_qdrant)
- [Sqlite-Vec](./inline_sqlite-vec)
- [Sqlite Vec](./inline_sqlite_vec)
- [Remote - Chromadb](./remote_chromadb)
- [Remote - Milvus](./remote_milvus)
- [Remote - Pgvector](./remote_pgvector)
- [Remote - Qdrant](./remote_qdrant)
- [Remote - Weaviate](./remote_weaviate)

@ -1,7 +1,37 @@
|
|||
---
|
||||
description: '[Chroma](https://www'
|
||||
description: |
|
||||
[Chroma](https://www.trychroma.com/) is an inline and remote vector
|
||||
database provider for Llama Stack. It allows you to store and query vectors directly within a Chroma database.
|
||||
That means you're not limited to storing vectors in memory or in a separate service.
|
||||
|
||||
## Features
|
||||
Chroma supports:
|
||||
- Store embeddings and their metadata
|
||||
- Vector search
|
||||
- Full-text search
|
||||
- Document storage
|
||||
- Metadata filtering
|
||||
- Multi-modal retrieval
|
||||
|
||||
## Usage
|
||||
|
||||
To use Chrome in your Llama Stack project, follow these steps:
|
||||
|
||||
1. Install the necessary dependencies.
|
||||
2. Configure your Llama Stack project to use chroma.
|
||||
3. Start storing and querying vectors.
|
||||
|
||||
## Installation
|
||||
|
||||
You can install chroma using pip:
|
||||
|
||||
```bash
|
||||
pip install chromadb
|
||||
```
|
||||
|
||||
## Documentation
|
||||
See [Chroma's documentation](https://docs.trychroma.com/docs/overview/introduction) for more details about Chroma in general.
|
||||
sidebar_label: Chromadb
|
||||
sidebar_position: 2
|
||||
title: inline::chromadb
|
||||
---
|
||||
|
||||
|
@ -9,10 +39,41 @@ title: inline::chromadb
|
|||
|
||||
## Description
|
||||
|
||||
|
||||
[Chroma](https://www.trychroma.com/) is an inline and remote vector
|
||||
database provider for Llama Stack. It allows you to store and query vectors directly within a Chroma database.
|
||||
That means you're not limited to storing vectors in memory or in a separate service.
|
||||
|
||||
## Features
|
||||
Chroma supports:
|
||||
- Store embeddings and their metadata
|
||||
- Vector search
|
||||
- Full-text search
|
||||
- Document storage
|
||||
- Metadata filtering
|
||||
- Multi-modal retrieval
|
||||
|
||||
## Usage
|
||||
|
||||
To use Chrome in your Llama Stack project, follow these steps:
|
||||
|
||||
1. Install the necessary dependencies.
|
||||
2. Configure your Llama Stack project to use chroma.
|
||||
3. Start storing and querying vectors.
|
||||
|
||||
## Installation
|
||||
|
||||
You can install chroma using pip:
|
||||
|
||||
```bash
|
||||
pip install chromadb
|
||||
```
|
||||
|
||||
## Documentation
|
||||
See [Chroma's documentation](https://docs.trychroma.com/docs/overview/introduction) for more details about Chroma in general.
|
||||
|
||||
|
||||
|
||||
## Configuration
|
||||
|
||||
| Field | Type | Required | Default | Description |
|
||||
|
|
|
@ -1,7 +1,46 @@
|
|||
---
|
||||
description: '[Faiss](https://github'
|
||||
description: |
|
||||
[Faiss](https://github.com/facebookresearch/faiss) is an inline vector database provider for Llama Stack. It
|
||||
allows you to store and query vectors directly in memory.
|
||||
That means you'll get fast and efficient vector retrieval.
|
||||
|
||||
## Features
|
||||
|
||||
- Lightweight and easy to use
|
||||
- Fully integrated with Llama Stack
|
||||
- GPU support
|
||||
- **Vector search** - FAISS supports pure vector similarity search using embeddings
|
||||
|
||||
## Search Modes
|
||||
|
||||
**Supported:**
|
||||
- **Vector Search** (`mode="vector"`): Performs vector similarity search using embeddings
|
||||
|
||||
**Not Supported:**
|
||||
- **Keyword Search** (`mode="keyword"`): Not supported by FAISS
|
||||
- **Hybrid Search** (`mode="hybrid"`): Not supported by FAISS
|
||||
|
||||
> **Note**: FAISS is designed as a pure vector similarity search library. See the [FAISS GitHub repository](https://github.com/facebookresearch/faiss) for more details about FAISS's core functionality.
|
||||
|
||||
## Usage
|
||||
|
||||
To use Faiss in your Llama Stack project, follow these steps:
|
||||
|
||||
1. Install the necessary dependencies.
|
||||
2. Configure your Llama Stack project to use Faiss.
|
||||
3. Start storing and querying vectors.
|
||||
|
||||
## Installation
|
||||
|
||||
You can install Faiss using pip:
|
||||
|
||||
```bash
|
||||
pip install faiss-cpu
|
||||
```
|
||||
## Documentation
|
||||
See [Faiss' documentation](https://faiss.ai/) or the [Faiss Wiki](https://github.com/facebookresearch/faiss/wiki) for
|
||||
more details about Faiss in general.
|
||||
sidebar_label: Faiss
|
||||
sidebar_position: 3
|
||||
title: inline::faiss
|
||||
---
|
||||
|
||||
|
@ -9,10 +48,49 @@ title: inline::faiss
|
|||
|
||||
## Description
|
||||
|
||||
|
||||
[Faiss](https://github.com/facebookresearch/faiss) is an inline vector database provider for Llama Stack. It
|
||||
allows you to store and query vectors directly in memory.
|
||||
That means you'll get fast and efficient vector retrieval.
|
||||
|
||||
## Features
|
||||
|
||||
- Lightweight and easy to use
|
||||
- Fully integrated with Llama Stack
|
||||
- GPU support
|
||||
- **Vector search** - FAISS supports pure vector similarity search using embeddings
|
||||
|
||||
## Search Modes
|
||||
|
||||
**Supported:**
|
||||
- **Vector Search** (`mode="vector"`): Performs vector similarity search using embeddings
|
||||
|
||||
**Not Supported:**
|
||||
- **Keyword Search** (`mode="keyword"`): Not supported by FAISS
|
||||
- **Hybrid Search** (`mode="hybrid"`): Not supported by FAISS
|
||||
|
||||
> **Note**: FAISS is designed as a pure vector similarity search library. See the [FAISS GitHub repository](https://github.com/facebookresearch/faiss) for more details about FAISS's core functionality.
|
||||
|
||||
## Usage
|
||||
|
||||
To use Faiss in your Llama Stack project, follow these steps:
|
||||
|
||||
1. Install the necessary dependencies.
|
||||
2. Configure your Llama Stack project to use Faiss.
|
||||
3. Start storing and querying vectors.
|
||||
|
||||
## Installation
|
||||
|
||||
You can install Faiss using pip:
|
||||
|
||||
```bash
|
||||
pip install faiss-cpu
|
||||
```
|
||||
## Documentation
|
||||
See [Faiss' documentation](https://faiss.ai/) or the [Faiss Wiki](https://github.com/facebookresearch/faiss/wiki) for
|
||||
more details about Faiss in general.
|
||||
|
||||
|
||||
## Configuration
|
||||
|
||||
| Field | Type | Required | Default | Description |
|
||||
|
|
|
@@ -1,7 +1,6 @@
---
description: Meta's reference implementation of a vector database
sidebar_label: Meta Reference
sidebar_position: 4
description: "Meta's reference implementation of a vector database."
sidebar_label: Meta-Reference
title: inline::meta-reference
---

@@ -24,3 +23,9 @@ kvstore:
type: sqlite
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/dummy}/faiss_store.db
```
## Deprecation Notice

:::warning
Please use the `inline::faiss` provider instead.
:::

@ -1,7 +1,6 @@
|
|||
---
|
||||
description: Please refer to the remote provider documentation
|
||||
description: "Please refer to the remote provider documentation."
|
||||
sidebar_label: Milvus
|
||||
sidebar_position: 5
|
||||
title: inline::milvus
|
||||
---
|
||||
|
||||
|
@ -9,8 +8,10 @@ title: inline::milvus
|
|||
|
||||
## Description
|
||||
|
||||
|
||||
Please refer to the remote provider documentation.
|
||||
|
||||
|
||||
## Configuration
|
||||
|
||||
| Field | Type | Required | Default | Description |
|
||||
|
|
|
@ -1,7 +1,47 @@
|
|||
---
|
||||
description: '[Qdrant](https://qdrant'
|
||||
description: |
|
||||
[Qdrant](https://qdrant.tech/documentation/) is an inline and remote vector database provider for Llama Stack. It
|
||||
allows you to store and query vectors directly in memory.
|
||||
That means you'll get fast and efficient vector retrieval.
|
||||
|
||||
> By default, Qdrant stores vectors in RAM, delivering incredibly fast access for datasets that fit comfortably in
|
||||
> memory. But when your dataset exceeds RAM capacity, Qdrant offers Memmap as an alternative.
|
||||
>
|
||||
> \[[An Introduction to Vector Databases](https://qdrant.tech/articles/what-is-a-vector-database/)\]
|
||||
|
||||
|
||||
|
||||
## Features
|
||||
|
||||
- Lightweight and easy to use
|
||||
- Fully integrated with Llama Stack
|
||||
- Apache 2.0 license terms
|
||||
- Store embeddings and their metadata
|
||||
- Supports search by
|
||||
[Keyword](https://qdrant.tech/articles/qdrant-introduces-full-text-filters-and-indexes/)
|
||||
and [Hybrid](https://qdrant.tech/articles/hybrid-search/#building-a-hybrid-search-system-in-qdrant) search
|
||||
- [Multilingual and Multimodal retrieval](https://qdrant.tech/documentation/multimodal-search/)
|
||||
- [Medatata filtering](https://qdrant.tech/articles/vector-search-filtering/)
|
||||
- [GPU support](https://qdrant.tech/documentation/guides/running-with-gpu/)
|
||||
|
||||
## Usage
|
||||
|
||||
To use Qdrant in your Llama Stack project, follow these steps:
|
||||
|
||||
1. Install the necessary dependencies.
|
||||
2. Configure your Llama Stack project to use Qdrant.
|
||||
3. Start storing and querying vectors.
|
||||
|
||||
## Installation
|
||||
|
||||
You can install Qdrant using docker:
|
||||
|
||||
```bash
|
||||
docker pull qdrant/qdrant
|
||||
```
|
||||
## Documentation
|
||||
See the [Qdrant documentation](https://qdrant.tech/documentation/) for more details about Qdrant in general.
|
||||
sidebar_label: Qdrant
|
||||
sidebar_position: 6
|
||||
title: inline::qdrant
|
||||
---
|
||||
|
||||
|
@ -9,6 +49,7 @@ title: inline::qdrant
|
|||
|
||||
## Description
|
||||
|
||||
|
||||
[Qdrant](https://qdrant.tech/documentation/) is an inline and remote vector database provider for Llama Stack. It
|
||||
allows you to store and query vectors directly in memory.
|
||||
That means you'll get fast and efficient vector retrieval.
|
||||
|
@ -18,6 +59,40 @@ That means you'll get fast and efficient vector retrieval.
|
|||
>
|
||||
> \[[An Introduction to Vector Databases](https://qdrant.tech/articles/what-is-a-vector-database/)\]
|
||||
|
||||
|
||||
|
||||
## Features
|
||||
|
||||
- Lightweight and easy to use
|
||||
- Fully integrated with Llama Stack
|
||||
- Apache 2.0 license terms
|
||||
- Store embeddings and their metadata
|
||||
- Supports search by
|
||||
[Keyword](https://qdrant.tech/articles/qdrant-introduces-full-text-filters-and-indexes/)
|
||||
and [Hybrid](https://qdrant.tech/articles/hybrid-search/#building-a-hybrid-search-system-in-qdrant) search
|
||||
- [Multilingual and Multimodal retrieval](https://qdrant.tech/documentation/multimodal-search/)
|
||||
- [Medatata filtering](https://qdrant.tech/articles/vector-search-filtering/)
|
||||
- [GPU support](https://qdrant.tech/documentation/guides/running-with-gpu/)
|
||||
|
||||
## Usage
|
||||
|
||||
To use Qdrant in your Llama Stack project, follow these steps:
|
||||
|
||||
1. Install the necessary dependencies.
|
||||
2. Configure your Llama Stack project to use Qdrant.
|
||||
3. Start storing and querying vectors.
|
||||
|
||||
## Installation
|
||||
|
||||
You can install Qdrant using docker:
|
||||
|
||||
```bash
|
||||
docker pull qdrant/qdrant
|
||||
```
|
||||
## Documentation
|
||||
See the [Qdrant documentation](https://qdrant.tech/documentation/) for more details about Qdrant in general.
|
||||
|
||||
|
||||
## Configuration
|
||||
|
||||
| Field | Type | Required | Default | Description |
|
||||
|
|
|
@ -1,7 +1,202 @@
|
|||
---
|
||||
description: '[SQLite-Vec](https://github'
|
||||
sidebar_label: SQLite-Vec
|
||||
sidebar_position: 7
|
||||
description: |
|
||||
[SQLite-Vec](https://github.com/asg017/sqlite-vec) is an inline vector database provider for Llama Stack. It
|
||||
allows you to store and query vectors directly within an SQLite database.
|
||||
That means you're not limited to storing vectors in memory or in a separate service.
|
||||
|
||||
## Features
|
||||
|
||||
- Lightweight and easy to use
|
||||
- Fully integrated with Llama Stacks
|
||||
- Uses disk-based storage for persistence, allowing for larger vector storage
|
||||
|
||||
### Comparison to Faiss
|
||||
|
||||
The choice between Faiss and sqlite-vec should be made based on the needs of your application,
|
||||
as they have different strengths.
|
||||
|
||||
#### Choosing the Right Provider
|
||||
|
||||
Scenario | Recommended Tool | Reason
|
||||
-- |-----------------| --
|
||||
Online Analytical Processing (OLAP) | Faiss | Fast, in-memory searches
|
||||
Online Transaction Processing (OLTP) | sqlite-vec | Frequent writes and reads
|
||||
Frequent writes | sqlite-vec | Efficient disk-based storage and incremental indexing
|
||||
Large datasets | sqlite-vec | Disk-based storage for larger vector storage
|
||||
Datasets that can fit in memory, frequent reads | Faiss | Optimized for speed, indexing, and GPU acceleration
|
||||
|
||||
#### Empirical Example
|
||||
|
||||
Consider the histogram below in which 10,000 randomly generated strings were inserted
|
||||
in batches of 100 into both Faiss and sqlite-vec using `client.tool_runtime.rag_tool.insert()`.
|
||||
|
||||
```{image} ../../../../_static/providers/vector_io/write_time_comparison_sqlite-vec-faiss.png
|
||||
:alt: Comparison of SQLite-Vec and Faiss write times
|
||||
:width: 400px
|
||||
```
|
||||
|
||||
You will notice that the average write time for `sqlite-vec` was 788ms, compared to
|
||||
47,640ms for Faiss. While the number is jarring, if you look at the distribution, you can see that it is rather
|
||||
uniformly spread across the [1500, 100000] interval.
|
||||
|
||||
Looking at each individual write in the order that the documents are inserted, you'll see the increase in
|
||||
write time as Faiss re-indexes the vectors after each write.
|
||||
```{image} ../../../../_static/providers/vector_io/write_time_sequence_sqlite-vec-faiss.png
|
||||
:alt: Comparison of SQLite-Vec and Faiss write times
|
||||
:width: 400px
|
||||
```
|
||||
|
||||
In comparison, read times for Faiss were on average 10% faster than for sqlite-vec.
|
||||
The modes of the two distributions highlight the difference further: Faiss
|
||||
will likely yield faster read performance.
|
||||
|
||||
```{image} ../../../../_static/providers/vector_io/read_time_comparison_sqlite-vec-faiss.png
|
||||
:alt: Comparison of SQLite-Vec and Faiss read times
|
||||
:width: 400px
|
||||
```
|
||||
|
||||
## Usage
|
||||
|
||||
To use sqlite-vec in your Llama Stack project, follow these steps:
|
||||
|
||||
1. Install the necessary dependencies.
|
||||
2. Configure your Llama Stack project to use SQLite-Vec.
|
||||
3. Start storing and querying vectors.
|
||||
|
||||
The SQLite-vec provider supports three search modes:
|
||||
|
||||
1. **Vector Search** (`mode="vector"`): Performs pure vector similarity search using the embeddings.
|
||||
2. **Keyword Search** (`mode="keyword"`): Performs full-text search using SQLite's FTS5.
|
||||
3. **Hybrid Search** (`mode="hybrid"`): Combines both vector and keyword search for better results. First performs keyword search to get candidate matches, then applies vector similarity search on those candidates.
|
||||
|
||||
Example with hybrid search:
|
||||
```python
|
||||
response = await vector_io.query_chunks(
|
||||
vector_db_id="my_db",
|
||||
query="your query here",
|
||||
params={"mode": "hybrid", "max_chunks": 3, "score_threshold": 0.7},
|
||||
)
|
||||
|
||||
# Using RRF ranker
|
||||
response = await vector_io.query_chunks(
|
||||
vector_db_id="my_db",
|
||||
query="your query here",
|
||||
params={
|
||||
"mode": "hybrid",
|
||||
"max_chunks": 3,
|
||||
"score_threshold": 0.7,
|
||||
"ranker": {"type": "rrf", "impact_factor": 60.0},
|
||||
},
|
||||
)
|
||||
|
||||
# Using weighted ranker
|
||||
response = await vector_io.query_chunks(
|
||||
vector_db_id="my_db",
|
||||
query="your query here",
|
||||
params={
|
||||
"mode": "hybrid",
|
||||
"max_chunks": 3,
|
||||
"score_threshold": 0.7,
|
||||
"ranker": {"type": "weighted", "alpha": 0.7}, # 70% vector, 30% keyword
|
||||
},
|
||||
)
|
||||
```
|
||||
|
||||
Example with explicit vector search:
|
||||
```python
|
||||
response = await vector_io.query_chunks(
|
||||
vector_db_id="my_db",
|
||||
query="your query here",
|
||||
params={"mode": "vector", "max_chunks": 3, "score_threshold": 0.7},
|
||||
)
|
||||
```
|
||||
|
||||
Example with keyword search:
|
||||
```python
|
||||
response = await vector_io.query_chunks(
|
||||
vector_db_id="my_db",
|
||||
query="your query here",
|
||||
params={"mode": "keyword", "max_chunks": 3, "score_threshold": 0.7},
|
||||
)
|
||||
```
|
||||
|
||||
## Supported Search Modes
|
||||
|
||||
The SQLite vector store supports three search modes:
|
||||
|
||||
1. **Vector Search** (`mode="vector"`): Uses vector similarity to find relevant chunks
|
||||
2. **Keyword Search** (`mode="keyword"`): Uses keyword matching to find relevant chunks
|
||||
3. **Hybrid Search** (`mode="hybrid"`): Combines both vector and keyword scores using a ranker
|
||||
|
||||
### Hybrid Search
|
||||
|
||||
Hybrid search combines the strengths of both vector and keyword search by:
|
||||
- Computing vector similarity scores
|
||||
- Computing keyword match scores
|
||||
- Using a ranker to combine these scores
|
||||
|
||||
Two ranker types are supported:
|
||||
|
||||
1. **RRF (Reciprocal Rank Fusion)**:
|
||||
- Combines ranks from both vector and keyword results
|
||||
- Uses an impact factor (default: 60.0) to control the weight of higher-ranked results
|
||||
- Good for balancing between vector and keyword results
|
||||
- The default impact factor of 60.0 comes from the original RRF paper by Cormack et al. (2009) [^1], which found this value to provide optimal performance across various retrieval tasks
|
||||
|
||||
2. **Weighted**:
|
||||
- Linearly combines normalized vector and keyword scores
|
||||
- Uses an alpha parameter (0-1) to control the blend:
|
||||
- alpha=0: Only use keyword scores
|
||||
- alpha=1: Only use vector scores
|
||||
- alpha=0.5: Equal weight to both (default)
|
||||
|
||||
Example using RAGQueryConfig with different search modes:
|
||||
|
||||
```python
|
||||
from llama_stack.apis.tools import RAGQueryConfig, RRFRanker, WeightedRanker
|
||||
|
||||
# Vector search
|
||||
config = RAGQueryConfig(mode="vector", max_chunks=5)
|
||||
|
||||
# Keyword search
|
||||
config = RAGQueryConfig(mode="keyword", max_chunks=5)
|
||||
|
||||
# Hybrid search with custom RRF ranker
|
||||
config = RAGQueryConfig(
|
||||
mode="hybrid",
|
||||
max_chunks=5,
|
||||
ranker=RRFRanker(impact_factor=50.0), # Custom impact factor
|
||||
)
|
||||
|
||||
# Hybrid search with weighted ranker
|
||||
config = RAGQueryConfig(
|
||||
mode="hybrid",
|
||||
max_chunks=5,
|
||||
ranker=WeightedRanker(alpha=0.7), # 70% vector, 30% keyword
|
||||
)
|
||||
|
||||
# Hybrid search with default RRF ranker
|
||||
config = RAGQueryConfig(
|
||||
mode="hybrid", max_chunks=5
|
||||
) # Will use RRF with impact_factor=60.0
|
||||
```
|
||||
|
||||
Note: The ranker configuration is only used in hybrid mode. For vector or keyword modes, the ranker parameter is ignored.
|
||||
|
||||
## Installation
|
||||
|
||||
You can install SQLite-Vec using pip:
|
||||
|
||||
```bash
|
||||
pip install sqlite-vec
|
||||
```
|
||||
|
||||
## Documentation
|
||||
|
||||
See [sqlite-vec's GitHub repo](https://github.com/asg017/sqlite-vec/tree/main) for more details about sqlite-vec in general.
|
||||
|
||||
[^1]: Cormack, G. V., Clarke, C. L., & Buettcher, S. (2009). [Reciprocal rank fusion outperforms condorcet and individual rank learning methods](https://dl.acm.org/doi/10.1145/1571941.1572114). In Proceedings of the 32nd international ACM SIGIR conference on Research and development in information retrieval (pp. 758-759).
|
||||
sidebar_label: Sqlite-Vec
|
||||
title: inline::sqlite-vec
|
||||
---
|
||||
|
||||
|
@ -9,10 +204,205 @@ title: inline::sqlite-vec
|
|||
|
||||
## Description
|
||||
|
||||
|
||||
[SQLite-Vec](https://github.com/asg017/sqlite-vec) is an inline vector database provider for Llama Stack. It
|
||||
allows you to store and query vectors directly within an SQLite database.
|
||||
That means you're not limited to storing vectors in memory or in a separate service.
|
||||
|
||||
## Features
|
||||
|
||||
- Lightweight and easy to use
|
||||
- Fully integrated with Llama Stack
|
||||
- Uses disk-based storage for persistence, allowing for larger vector storage
|
||||
|
||||
### Comparison to Faiss
|
||||
|
||||
The choice between Faiss and sqlite-vec should be made based on the needs of your application,
|
||||
as they have different strengths.
|
||||
|
||||
#### Choosing the Right Provider
|
||||
|
||||
Scenario | Recommended Tool | Reason
|
||||
-- |-----------------| --
|
||||
Online Analytical Processing (OLAP) | Faiss | Fast, in-memory searches
|
||||
Online Transaction Processing (OLTP) | sqlite-vec | Frequent writes and reads
|
||||
Frequent writes | sqlite-vec | Efficient disk-based storage and incremental indexing
|
||||
Large datasets | sqlite-vec | Disk-based storage for larger vector storage
|
||||
Datasets that can fit in memory, frequent reads | Faiss | Optimized for speed, indexing, and GPU acceleration
|
||||
|
||||
#### Empirical Example
|
||||
|
||||
Consider the histogram below in which 10,000 randomly generated strings were inserted
|
||||
in batches of 100 into both Faiss and sqlite-vec using `client.tool_runtime.rag_tool.insert()`.
|
||||
|
||||
```{image} ../../../../_static/providers/vector_io/write_time_comparison_sqlite-vec-faiss.png
|
||||
:alt: Comparison of SQLite-Vec and Faiss write times
|
||||
:width: 400px
|
||||
```
|
||||
|
||||
You will notice that the average write time for `sqlite-vec` was 788ms, compared to
|
||||
47,640ms for Faiss. While the number is jarring, if you look at the distribution, you can see that it is rather
|
||||
uniformly spread across the [1500, 100000] interval.
|
||||
|
||||
Looking at each individual write in the order that the documents are inserted, you'll see the increase in
|
||||
write time as Faiss re-indexes the vectors after each write.
|
||||
```{image} ../../../../_static/providers/vector_io/write_time_sequence_sqlite-vec-faiss.png
|
||||
:alt: Comparison of SQLite-Vec and Faiss write times
|
||||
:width: 400px
|
||||
```
|
||||
|
||||
In comparison, read times for Faiss were on average 10% faster than for sqlite-vec.
|
||||
The modes of the two distributions highlight the difference further: Faiss
|
||||
will likely yield faster read performance.
|
||||
|
||||
```{image} ../../../../_static/providers/vector_io/read_time_comparison_sqlite-vec-faiss.png
|
||||
:alt: Comparison of SQLite-Vec and Faiss read times
|
||||
:width: 400px
|
||||
```
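
For reference, the benchmark above populates both providers through `client.tool_runtime.rag_tool.insert()`. A minimal sketch of that call is shown below; it is not the benchmark code itself, and the document contents, the vector DB id (`my_db`), the chunk size, and the `RAGDocument` import location are assumptions that may vary with your client version.

```python
from llama_stack_client import LlamaStackClient, RAGDocument

client = LlamaStackClient(base_url="http://localhost:8321")  # assumed server address

# A single toy document; the benchmark inserted 10,000 random strings in batches of 100.
doc = RAGDocument(
    document_id="doc-0",
    content="an example string to embed and store",
    mime_type="text/plain",
    metadata={},
)

client.tool_runtime.rag_tool.insert(
    documents=[doc],
    vector_db_id="my_db",  # assumed to be registered with the sqlite-vec provider
    chunk_size_in_tokens=128,  # placeholder chunk size
)
```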
|
||||
|
||||
## Usage
|
||||
|
||||
To use sqlite-vec in your Llama Stack project, follow these steps:
|
||||
|
||||
1. Install the necessary dependencies.
|
||||
2. Configure your Llama Stack project to use SQLite-Vec.
|
||||
3. Start storing and querying vectors.
|
||||
|
||||
The SQLite-vec provider supports three search modes:
|
||||
|
||||
1. **Vector Search** (`mode="vector"`): Performs pure vector similarity search using the embeddings.
|
||||
2. **Keyword Search** (`mode="keyword"`): Performs full-text search using SQLite's FTS5.
|
||||
3. **Hybrid Search** (`mode="hybrid"`): Combines both vector and keyword search for better results. First performs keyword search to get candidate matches, then applies vector similarity search on those candidates.
|
||||
|
||||
Example with hybrid search:
|
||||
```python
|
||||
response = await vector_io.query_chunks(
|
||||
vector_db_id="my_db",
|
||||
query="your query here",
|
||||
params={"mode": "hybrid", "max_chunks": 3, "score_threshold": 0.7},
|
||||
)
|
||||
|
||||
# Using RRF ranker
|
||||
response = await vector_io.query_chunks(
|
||||
vector_db_id="my_db",
|
||||
query="your query here",
|
||||
params={
|
||||
"mode": "hybrid",
|
||||
"max_chunks": 3,
|
||||
"score_threshold": 0.7,
|
||||
"ranker": {"type": "rrf", "impact_factor": 60.0},
|
||||
},
|
||||
)
|
||||
|
||||
# Using weighted ranker
|
||||
response = await vector_io.query_chunks(
|
||||
vector_db_id="my_db",
|
||||
query="your query here",
|
||||
params={
|
||||
"mode": "hybrid",
|
||||
"max_chunks": 3,
|
||||
"score_threshold": 0.7,
|
||||
"ranker": {"type": "weighted", "alpha": 0.7}, # 70% vector, 30% keyword
|
||||
},
|
||||
)
|
||||
```
|
||||
|
||||
Example with explicit vector search:
|
||||
```python
|
||||
response = await vector_io.query_chunks(
|
||||
vector_db_id="my_db",
|
||||
query="your query here",
|
||||
params={"mode": "vector", "max_chunks": 3, "score_threshold": 0.7},
|
||||
)
|
||||
```
|
||||
|
||||
Example with keyword search:
|
||||
```python
|
||||
response = await vector_io.query_chunks(
|
||||
vector_db_id="my_db",
|
||||
query="your query here",
|
||||
params={"mode": "keyword", "max_chunks": 3, "score_threshold": 0.7},
|
||||
)
|
||||
```
|
||||
|
||||
## Supported Search Modes
|
||||
|
||||
The SQLite vector store supports three search modes:
|
||||
|
||||
1. **Vector Search** (`mode="vector"`): Uses vector similarity to find relevant chunks
|
||||
2. **Keyword Search** (`mode="keyword"`): Uses keyword matching to find relevant chunks
|
||||
3. **Hybrid Search** (`mode="hybrid"`): Combines both vector and keyword scores using a ranker
|
||||
|
||||
### Hybrid Search
|
||||
|
||||
Hybrid search combines the strengths of both vector and keyword search by:
|
||||
- Computing vector similarity scores
|
||||
- Computing keyword match scores
|
||||
- Using a ranker to combine these scores
|
||||
|
||||
Two ranker types are supported:
|
||||
|
||||
1. **RRF (Reciprocal Rank Fusion)**:
|
||||
- Combines ranks from both vector and keyword results
|
||||
- Uses an impact factor (default: 60.0) to control the weight of higher-ranked results
|
||||
- Good for balancing between vector and keyword results
|
||||
- The default impact factor of 60.0 comes from the original RRF paper by Cormack et al. (2009) [^1], which found this value to provide optimal performance across various retrieval tasks
|
||||
|
||||
2. **Weighted**:
|
||||
- Linearly combines normalized vector and keyword scores
|
||||
- Uses an alpha parameter (0-1) to control the blend:
|
||||
- alpha=0: Only use keyword scores
|
||||
- alpha=1: Only use vector scores
|
||||
- alpha=0.5: Equal weight to both (default)
|
||||
|
||||
Example using RAGQueryConfig with different search modes:
|
||||
|
||||
```python
|
||||
from llama_stack.apis.tools import RAGQueryConfig, RRFRanker, WeightedRanker
|
||||
|
||||
# Vector search
|
||||
config = RAGQueryConfig(mode="vector", max_chunks=5)
|
||||
|
||||
# Keyword search
|
||||
config = RAGQueryConfig(mode="keyword", max_chunks=5)
|
||||
|
||||
# Hybrid search with custom RRF ranker
|
||||
config = RAGQueryConfig(
|
||||
mode="hybrid",
|
||||
max_chunks=5,
|
||||
ranker=RRFRanker(impact_factor=50.0), # Custom impact factor
|
||||
)
|
||||
|
||||
# Hybrid search with weighted ranker
|
||||
config = RAGQueryConfig(
|
||||
mode="hybrid",
|
||||
max_chunks=5,
|
||||
ranker=WeightedRanker(alpha=0.7), # 70% vector, 30% keyword
|
||||
)
|
||||
|
||||
# Hybrid search with default RRF ranker
|
||||
config = RAGQueryConfig(
|
||||
mode="hybrid", max_chunks=5
|
||||
) # Will use RRF with impact_factor=60.0
|
||||
```
|
||||
|
||||
Note: The ranker configuration is only used in hybrid mode. For vector or keyword modes, the ranker parameter is ignored.
|
||||
|
||||
## Installation
|
||||
|
||||
You can install SQLite-Vec using pip:
|
||||
|
||||
```bash
|
||||
pip install sqlite-vec
|
||||
```
|
||||
|
||||
## Documentation
|
||||
|
||||
See [sqlite-vec's GitHub repo](https://github.com/asg017/sqlite-vec/tree/main) for more details about sqlite-vec in general.
|
||||
|
||||
[^1]: Cormack, G. V., Clarke, C. L., & Buettcher, S. (2009). [Reciprocal rank fusion outperforms condorcet and individual rank learning methods](https://dl.acm.org/doi/10.1145/1571941.1572114). In Proceedings of the 32nd international ACM SIGIR conference on Research and development in information retrieval (pp. 758-759).
|
||||
|
||||
|
||||
## Configuration
|
||||
|
||||
| Field | Type | Required | Default | Description |
|
||||
|
|
35
docs/docs/providers/vector_io/inline_sqlite_vec.mdx
Normal file
|
@ -0,0 +1,35 @@
|
|||
---
|
||||
description: "Please refer to the sqlite-vec provider documentation."
|
||||
sidebar_label: Sqlite Vec
|
||||
title: inline::sqlite_vec
|
||||
---
|
||||
|
||||
# inline::sqlite_vec
|
||||
|
||||
## Description
|
||||
|
||||
|
||||
Please refer to the sqlite-vec provider documentation.
|
||||
|
||||
|
||||
## Configuration
|
||||
|
||||
| Field | Type | Required | Default | Description |
|
||||
|-------|------|----------|---------|-------------|
|
||||
| `db_path` | `<class 'str'>` | No | | Path to the SQLite database file |
|
||||
| `kvstore` | `utils.kvstore.config.RedisKVStoreConfig \| utils.kvstore.config.SqliteKVStoreConfig \| utils.kvstore.config.PostgresKVStoreConfig \| utils.kvstore.config.MongoDBKVStoreConfig` | No | sqlite | Config for KV store backend (SQLite only for now) |
|
||||
|
||||
## Sample Configuration
|
||||
|
||||
```yaml
|
||||
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/dummy}/sqlite_vec.db
|
||||
kvstore:
|
||||
type: sqlite
|
||||
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/dummy}/sqlite_vec_registry.db
|
||||
```
|
||||
## Deprecation Notice
|
||||
|
||||
:::warning
|
||||
Please use the `inline::sqlite-vec` provider (notice the hyphen instead of underscore) instead.
|
||||
:::
|
||||
|
|
@ -1,7 +1,37 @@
|
|||
---
|
||||
description: '[Chroma](https://www'
|
||||
sidebar_label: Chromadb
|
||||
sidebar_position: 9
|
||||
description: |
|
||||
[Chroma](https://www.trychroma.com/) is an inline and remote vector
|
||||
database provider for Llama Stack. It allows you to store and query vectors directly within a Chroma database.
|
||||
That means you're not limited to storing vectors in memory or in a separate service.
|
||||
|
||||
## Features
|
||||
Chroma supports:
|
||||
- Store embeddings and their metadata
|
||||
- Vector search
|
||||
- Full-text search
|
||||
- Document storage
|
||||
- Metadata filtering
|
||||
- Multi-modal retrieval
|
||||
|
||||
## Usage
|
||||
|
||||
To use Chroma in your Llama Stack project, follow these steps:
|
||||
|
||||
1. Install the necessary dependencies.
|
||||
2. Configure your Llama Stack project to use Chroma.
|
||||
3. Start storing and querying vectors.
|
||||
|
||||
## Installation
|
||||
|
||||
You can install chroma using pip:
|
||||
|
||||
```bash
|
||||
pip install chromadb
|
||||
```
|
||||
|
||||
## Documentation
|
||||
See [Chroma's documentation](https://docs.trychroma.com/docs/overview/introduction) for more details about Chroma in general.
|
||||
sidebar_label: Remote - Chromadb
|
||||
title: remote::chromadb
|
||||
---
|
||||
|
||||
|
@ -9,10 +39,40 @@ title: remote::chromadb
|
|||
|
||||
## Description
|
||||
|
||||
|
||||
[Chroma](https://www.trychroma.com/) is an inline and remote vector
|
||||
database provider for Llama Stack. It allows you to store and query vectors directly within a Chroma database.
|
||||
That means you're not limited to storing vectors in memory or in a separate service.
|
||||
|
||||
## Features
|
||||
Chroma supports:
|
||||
- Store embeddings and their metadata
|
||||
- Vector search
|
||||
- Full-text search
|
||||
- Document storage
|
||||
- Metadata filtering
|
||||
- Multi-modal retrieval
|
||||
|
||||
## Usage
|
||||
|
||||
To use Chroma in your Llama Stack project, follow these steps:
|
||||
|
||||
1. Install the necessary dependencies.
|
||||
2. Configure your Llama Stack project to use Chroma.
|
||||
3. Start storing and querying vectors.
|
||||
|
||||
## Installation
|
||||
|
||||
You can install chroma using pip:
|
||||
|
||||
```bash
|
||||
pip install chromadb
|
||||
```
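
There is no client-side example on this page yet, so here is a sketch that mirrors the `client.vector_stores.search` calls used for other vector providers in this documentation. It assumes a Llama Stack server running locally with the `remote::chromadb` provider configured and an existing vector store; the store id and server address are placeholders.

```python
from llama_stack_client import LlamaStackClient

client = LlamaStackClient(base_url="http://localhost:8321")  # assumed server address

# Vector search against a Chroma-backed vector store.
search_response = client.vector_stores.search(
    vector_store_id="my_chroma_store",  # placeholder: assumed to exist already
    query="What is retrieval-augmented generation?",
    search_mode="vector",
    max_num_results=5,
)
print(search_response)
```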
|
||||
|
||||
## Documentation
|
||||
See [Chroma's documentation](https://docs.trychroma.com/docs/overview/introduction) for more details about Chroma in general.
|
||||
|
||||
|
||||
## Configuration
|
||||
|
||||
| Field | Type | Required | Default | Description |
|
||||
|
|
|
@ -1,7 +1,201 @@
|
|||
---
|
||||
description: '[Milvus](https://milvus'
|
||||
sidebar_label: Milvus
|
||||
sidebar_position: 10
|
||||
description: |
|
||||
[Milvus](https://milvus.io/) is an inline and remote vector database provider for Llama Stack. It
|
||||
allows you to store and query vectors directly within a Milvus database.
|
||||
That means you're not limited to storing vectors in memory or in a separate service.
|
||||
|
||||
## Features
|
||||
|
||||
- Easy to use
|
||||
- Fully integrated with Llama Stack
|
||||
- Supports all search modes: vector, keyword, and hybrid search (both inline and remote configurations)
|
||||
|
||||
## Usage
|
||||
|
||||
To use Milvus in your Llama Stack project, follow these steps:
|
||||
|
||||
1. Install the necessary dependencies.
|
||||
2. Configure your Llama Stack project to use Milvus.
|
||||
3. Start storing and querying vectors.
|
||||
|
||||
## Installation
|
||||
|
||||
If you want to use inline Milvus, you can install:
|
||||
|
||||
```bash
|
||||
pip install pymilvus[milvus-lite]
|
||||
```
|
||||
|
||||
If you want to use remote Milvus, you can install:
|
||||
|
||||
```bash
|
||||
pip install pymilvus
|
||||
```
|
||||
|
||||
## Configuration
|
||||
|
||||
In Llama Stack, Milvus can be configured in two ways:
|
||||
- **Inline (Local) Configuration** - Uses Milvus-Lite for local storage
|
||||
- **Remote Configuration** - Connects to a remote Milvus server
|
||||
|
||||
### Inline (Local) Configuration
|
||||
|
||||
The simplest method is local configuration, which requires setting `db_path`, a path for locally storing Milvus-Lite files:
|
||||
|
||||
```yaml
|
||||
vector_io:
|
||||
- provider_id: milvus
|
||||
provider_type: inline::milvus
|
||||
config:
|
||||
db_path: ~/.llama/distributions/together/milvus_store.db
|
||||
```
|
||||
|
||||
### Remote Configuration
|
||||
|
||||
Remote configuration is suitable for larger data storage requirements:
|
||||
|
||||
#### Standard Remote Connection
|
||||
|
||||
```yaml
|
||||
vector_io:
|
||||
- provider_id: milvus
|
||||
provider_type: remote::milvus
|
||||
config:
|
||||
uri: "http://<host>:<port>"
|
||||
token: "<user>:<password>"
|
||||
```
|
||||
|
||||
#### TLS-Enabled Remote Connection (One-way TLS)
|
||||
|
||||
For connections to Milvus instances with one-way TLS enabled:
|
||||
|
||||
```yaml
|
||||
vector_io:
|
||||
- provider_id: milvus
|
||||
provider_type: remote::milvus
|
||||
config:
|
||||
uri: "https://<host>:<port>"
|
||||
token: "<user>:<password>"
|
||||
secure: True
|
||||
server_pem_path: "/path/to/server.pem"
|
||||
```
|
||||
|
||||
#### Mutual TLS (mTLS) Remote Connection
|
||||
|
||||
For connections to Milvus instances with mutual TLS (mTLS) enabled:
|
||||
|
||||
```yaml
|
||||
vector_io:
|
||||
- provider_id: milvus
|
||||
provider_type: remote::milvus
|
||||
config:
|
||||
uri: "https://<host>:<port>"
|
||||
token: "<user>:<password>"
|
||||
secure: True
|
||||
ca_pem_path: "/path/to/ca.pem"
|
||||
client_pem_path: "/path/to/client.pem"
|
||||
client_key_path: "/path/to/client.key"
|
||||
```
|
||||
|
||||
#### Key Parameters for TLS Configuration
|
||||
|
||||
- **`secure`**: Enables TLS encryption when set to `true`. Defaults to `false`.
|
||||
- **`server_pem_path`**: Path to the **server certificate** for verifying the server's identity (used in one-way TLS).
|
||||
- **`ca_pem_path`**: Path to the **Certificate Authority (CA) certificate** for validating the server certificate (required in mTLS).
|
||||
- **`client_pem_path`**: Path to the **client certificate** file (required for mTLS).
|
||||
- **`client_key_path`**: Path to the **client private key** file (required for mTLS).
|
||||
|
||||
## Search Modes
|
||||
|
||||
Milvus supports three different search modes for both inline and remote configurations:
|
||||
|
||||
### Vector Search
|
||||
Vector search uses semantic similarity to find the most relevant chunks based on embedding vectors. This is the default search mode and works well for finding conceptually similar content.
|
||||
|
||||
```python
|
||||
# Vector search example
|
||||
search_response = client.vector_stores.search(
|
||||
vector_store_id=vector_store.id,
|
||||
query="What is machine learning?",
|
||||
search_mode="vector",
|
||||
max_num_results=5,
|
||||
)
|
||||
```
|
||||
|
||||
### Keyword Search
|
||||
Keyword search uses traditional text-based matching to find chunks containing specific terms or phrases. This is useful when you need exact term matches.
|
||||
|
||||
```python
|
||||
# Keyword search example
|
||||
search_response = client.vector_stores.search(
|
||||
vector_store_id=vector_store.id,
|
||||
query="Python programming language",
|
||||
search_mode="keyword",
|
||||
max_num_results=5,
|
||||
)
|
||||
```
|
||||
|
||||
### Hybrid Search
|
||||
Hybrid search combines both vector and keyword search methods to provide more comprehensive results. It leverages the strengths of both semantic similarity and exact term matching.
|
||||
|
||||
#### Basic Hybrid Search
|
||||
```python
|
||||
# Basic hybrid search example (uses RRF ranker with default impact_factor=60.0)
|
||||
search_response = client.vector_stores.search(
|
||||
vector_store_id=vector_store.id,
|
||||
query="neural networks in Python",
|
||||
search_mode="hybrid",
|
||||
max_num_results=5,
|
||||
)
|
||||
```
|
||||
|
||||
**Note**: The default `impact_factor` value of 60.0 was empirically determined to be optimal in the original RRF research paper: ["Reciprocal Rank Fusion outperforms Condorcet and individual Rank Learning Methods"](https://plg.uwaterloo.ca/~gvcormac/cormacksigir09-rrf.pdf) (Cormack et al., 2009).
|
||||
|
||||
#### Hybrid Search with RRF (Reciprocal Rank Fusion) Ranker
|
||||
RRF combines rankings from vector and keyword search by using reciprocal ranks. The impact factor controls how much weight is given to higher-ranked results.
|
||||
|
||||
```python
|
||||
# Hybrid search with custom RRF parameters
|
||||
search_response = client.vector_stores.search(
|
||||
vector_store_id=vector_store.id,
|
||||
query="neural networks in Python",
|
||||
search_mode="hybrid",
|
||||
max_num_results=5,
|
||||
ranking_options={
|
||||
"ranker": {
|
||||
"type": "rrf",
|
||||
"impact_factor": 100.0, # Higher values give more weight to top-ranked results
|
||||
}
|
||||
},
|
||||
)
|
||||
```
|
||||
|
||||
#### Hybrid Search with Weighted Ranker
|
||||
Weighted ranker linearly combines normalized scores from vector and keyword search. The alpha parameter controls the balance between the two search methods.
|
||||
|
||||
```python
|
||||
# Hybrid search with weighted ranker
|
||||
search_response = client.vector_stores.search(
|
||||
vector_store_id=vector_store.id,
|
||||
query="neural networks in Python",
|
||||
search_mode="hybrid",
|
||||
max_num_results=5,
|
||||
ranking_options={
|
||||
"ranker": {
|
||||
"type": "weighted",
|
||||
"alpha": 0.7, # 70% vector search, 30% keyword search
|
||||
}
|
||||
},
|
||||
)
|
||||
```
|
||||
|
||||
For detailed documentation on RRF and Weighted rankers, please refer to the [Milvus Reranking Guide](https://milvus.io/docs/reranking.md).
|
||||
|
||||
## Documentation
|
||||
See the [Milvus documentation](https://milvus.io/docs/install-overview.md) for more details about Milvus in general.
|
||||
|
||||
For more details on TLS configuration, refer to the [TLS setup guide](https://milvus.io/docs/tls.md).
|
||||
sidebar_label: Remote - Milvus
|
||||
title: remote::milvus
|
||||
---
|
||||
|
||||
|
@ -9,10 +203,39 @@ title: remote::milvus
|
|||
|
||||
## Description
|
||||
|
||||
|
||||
[Milvus](https://milvus.io/) is an inline and remote vector database provider for Llama Stack. It
|
||||
allows you to store and query vectors directly within a Milvus database.
|
||||
That means you're not limited to storing vectors in memory or in a separate service.
|
||||
|
||||
## Features
|
||||
|
||||
- Easy to use
|
||||
- Fully integrated with Llama Stack
|
||||
- Supports all search modes: vector, keyword, and hybrid search (both inline and remote configurations)
|
||||
|
||||
## Usage
|
||||
|
||||
To use Milvus in your Llama Stack project, follow these steps:
|
||||
|
||||
1. Install the necessary dependencies.
|
||||
2. Configure your Llama Stack project to use Milvus.
|
||||
3. Start storing and querying vectors.
|
||||
|
||||
## Installation
|
||||
|
||||
If you want to use inline Milvus, you can install:
|
||||
|
||||
```bash
|
||||
pip install pymilvus[milvus-lite]
|
||||
```
|
||||
|
||||
If you want to use remote Milvus, you can install:
|
||||
|
||||
```bash
|
||||
pip install pymilvus
|
||||
```
|
||||
|
||||
## Configuration
|
||||
|
||||
In Llama Stack, Milvus can be configured in two ways:
|
||||
|
@ -86,6 +309,112 @@ vector_io:
|
|||
- **`client_pem_path`**: Path to the **client certificate** file (required for mTLS).
|
||||
- **`client_key_path`**: Path to the **client private key** file (required for mTLS).
|
||||
|
||||
## Search Modes
|
||||
|
||||
Milvus supports three different search modes for both inline and remote configurations:
|
||||
|
||||
### Vector Search
|
||||
Vector search uses semantic similarity to find the most relevant chunks based on embedding vectors. This is the default search mode and works well for finding conceptually similar content.
|
||||
|
||||
```python
|
||||
# Vector search example
|
||||
search_response = client.vector_stores.search(
|
||||
vector_store_id=vector_store.id,
|
||||
query="What is machine learning?",
|
||||
search_mode="vector",
|
||||
max_num_results=5,
|
||||
)
|
||||
```
|
||||
|
||||
### Keyword Search
|
||||
Keyword search uses traditional text-based matching to find chunks containing specific terms or phrases. This is useful when you need exact term matches.
|
||||
|
||||
```python
|
||||
# Keyword search example
|
||||
search_response = client.vector_stores.search(
|
||||
vector_store_id=vector_store.id,
|
||||
query="Python programming language",
|
||||
search_mode="keyword",
|
||||
max_num_results=5,
|
||||
)
|
||||
```
|
||||
|
||||
### Hybrid Search
|
||||
Hybrid search combines both vector and keyword search methods to provide more comprehensive results. It leverages the strengths of both semantic similarity and exact term matching.
|
||||
|
||||
#### Basic Hybrid Search
|
||||
```python
|
||||
# Basic hybrid search example (uses RRF ranker with default impact_factor=60.0)
|
||||
search_response = client.vector_stores.search(
|
||||
vector_store_id=vector_store.id,
|
||||
query="neural networks in Python",
|
||||
search_mode="hybrid",
|
||||
max_num_results=5,
|
||||
)
|
||||
```
|
||||
|
||||
**Note**: The default `impact_factor` value of 60.0 was empirically determined to be optimal in the original RRF research paper: ["Reciprocal Rank Fusion outperforms Condorcet and individual Rank Learning Methods"](https://plg.uwaterloo.ca/~gvcormac/cormacksigir09-rrf.pdf) (Cormack et al., 2009).
|
||||
|
||||
#### Hybrid Search with RRF (Reciprocal Rank Fusion) Ranker
|
||||
RRF combines rankings from vector and keyword search by using reciprocal ranks. The impact factor controls how much weight is given to higher-ranked results.
|
||||
|
||||
```python
|
||||
# Hybrid search with custom RRF parameters
|
||||
search_response = client.vector_stores.search(
|
||||
vector_store_id=vector_store.id,
|
||||
query="neural networks in Python",
|
||||
search_mode="hybrid",
|
||||
max_num_results=5,
|
||||
ranking_options={
|
||||
"ranker": {
|
||||
"type": "rrf",
|
||||
"impact_factor": 100.0, # Higher values give more weight to top-ranked results
|
||||
}
|
||||
},
|
||||
)
|
||||
```
|
||||
|
||||
#### Hybrid Search with Weighted Ranker
|
||||
Weighted ranker linearly combines normalized scores from vector and keyword search. The alpha parameter controls the balance between the two search methods.
|
||||
|
||||
```python
|
||||
# Hybrid search with weighted ranker
|
||||
search_response = client.vector_stores.search(
|
||||
vector_store_id=vector_store.id,
|
||||
query="neural networks in Python",
|
||||
search_mode="hybrid",
|
||||
max_num_results=5,
|
||||
ranking_options={
|
||||
"ranker": {
|
||||
"type": "weighted",
|
||||
"alpha": 0.7, # 70% vector search, 30% keyword search
|
||||
}
|
||||
},
|
||||
)
|
||||
```
|
||||
|
||||
For detailed documentation on RRF and Weighted rankers, please refer to the [Milvus Reranking Guide](https://milvus.io/docs/reranking.md).
|
||||
|
||||
## Documentation
|
||||
See the [Milvus documentation](https://milvus.io/docs/install-overview.md) for more details about Milvus in general.
|
||||
|
||||
For more details on TLS configuration, refer to the [TLS setup guide](https://milvus.io/docs/tls.md).
|
||||
|
||||
|
||||
## Configuration
|
||||
|
||||
| Field | Type | Required | Default | Description |
|
||||
|-------|------|----------|---------|-------------|
|
||||
| `uri` | `<class 'str'>` | No | | The URI of the Milvus server |
|
||||
| `token` | `str \| None` | No | | The token of the Milvus server |
|
||||
| `consistency_level` | `<class 'str'>` | No | Strong | The consistency level of the Milvus server |
|
||||
| `kvstore` | `utils.kvstore.config.RedisKVStoreConfig \| utils.kvstore.config.SqliteKVStoreConfig \| utils.kvstore.config.PostgresKVStoreConfig \| utils.kvstore.config.MongoDBKVStoreConfig` | No | sqlite | Config for KV store backend |
|
||||
| `config` | `dict` | No | {} | This configuration allows additional fields to be passed through to the underlying Milvus client. See the [Milvus](https://milvus.io/docs/install-overview.md) documentation for more details about Milvus in general. |
|
||||
|
||||
:::note
|
||||
This configuration class accepts additional fields beyond those listed above. You can pass any additional configuration options that will be forwarded to the underlying provider.
|
||||
:::
|
||||
|
||||
## Sample Configuration
|
||||
|
||||
```yaml
|
||||
|
|
|
@ -1,7 +1,105 @@
|
|||
---
|
||||
description: '[PGVector](https://github'
|
||||
sidebar_label: Pgvector
|
||||
sidebar_position: 11
|
||||
description: |
|
||||
[PGVector](https://github.com/pgvector/pgvector) is a remote vector database provider for Llama Stack. It
|
||||
allows you to store and query vectors directly within a PostgreSQL database.
|
||||
That means you'll get fast and efficient vector retrieval.
|
||||
|
||||
## Features
|
||||
|
||||
- Easy to use
|
||||
- Fully integrated with Llama Stack
|
||||
|
||||
There are three search implementations available for PGVectorIndex:
|
||||
|
||||
1. Vector Search:
|
||||
- How it works:
|
||||
- Uses PostgreSQL's vector extension (pgvector) to perform similarity search
|
||||
- Compares query embeddings against stored embeddings using Cosine distance or other distance metrics
|
||||
- E.g. SQL query: SELECT document, embedding <=> %s::vector AS distance FROM table ORDER BY distance
|
||||
|
||||
- Characteristics:
|
||||
- Semantic understanding - finds documents similar in meaning even if they don't share keywords
|
||||
- Works with high-dimensional vector embeddings (typically 768, 1024, or higher dimensions)
|
||||
- Best for: Finding conceptually related content, handling synonyms, cross-language search
|
||||
|
||||
2. Keyword Search
|
||||
- How it works:
|
||||
- Uses PostgreSQL's full-text search capabilities with tsvector and ts_rank
|
||||
- Converts text to searchable tokens using to_tsvector('english', text). Default language is English.
|
||||
- E.g. SQL query: SELECT document, ts_rank(tokenized_content, plainto_tsquery('english', %s)) AS score
|
||||
|
||||
- Characteristics:
|
||||
- Lexical matching - finds exact keyword matches and variations
|
||||
- Uses GIN (Generalized Inverted Index) for fast text search performance
|
||||
- Scoring: Uses PostgreSQL's ts_rank function for relevance scoring
|
||||
- Best for: Exact term matching, proper names, technical terms, Boolean-style queries
|
||||
|
||||
3. Hybrid Search
|
||||
- How it works:
|
||||
- Combines both vector and keyword search results
|
||||
- Runs both searches independently, then merges results using configurable reranking
|
||||
|
||||
- Two reranking strategies available:
|
||||
- Reciprocal Rank Fusion (RRF) - (default: 60.0)
|
||||
- Weighted Average - (default: 0.5)
|
||||
|
||||
- Characteristics:
|
||||
- Best of both worlds: semantic understanding + exact matching
|
||||
- Documents appearing in both searches get boosted scores
|
||||
- Configurable balance between semantic and lexical matching
|
||||
- Best for: General-purpose search where you want both precision and recall
|
||||
|
||||
4. Database Schema
|
||||
The PGVector implementation stores data optimized for all three search types:
|
||||
CREATE TABLE vector_store_xxx (
|
||||
id TEXT PRIMARY KEY,
|
||||
document JSONB, -- Original document
|
||||
embedding vector(dimension), -- For vector search
|
||||
content_text TEXT, -- Raw text content
|
||||
tokenized_content TSVECTOR -- For keyword search
|
||||
);
|
||||
|
||||
-- Indexes for performance
|
||||
CREATE INDEX content_gin_idx ON table USING GIN(tokenized_content); -- Keyword search
|
||||
-- Vector index created automatically by pgvector
|
||||
|
||||
## Usage
|
||||
|
||||
To use PGVector in your Llama Stack project, follow these steps:
|
||||
|
||||
1. Install the necessary dependencies.
|
||||
2. Configure your Llama Stack project to use pgvector (e.g. `remote::pgvector`).
|
||||
3. Start storing and querying vectors.
|
||||
|
||||
## Example: Setting up your environment for PGVector
|
||||
|
||||
1. Export env vars:
|
||||
```bash
|
||||
export ENABLE_PGVECTOR=true
|
||||
export PGVECTOR_HOST=localhost
|
||||
export PGVECTOR_PORT=5432
|
||||
export PGVECTOR_DB=llamastack
|
||||
export PGVECTOR_USER=llamastack
|
||||
export PGVECTOR_PASSWORD=llamastack
|
||||
```
|
||||
|
||||
2. Create DB:
|
||||
```bash
|
||||
psql -h localhost -U postgres -c "CREATE ROLE llamastack LOGIN PASSWORD 'llamastack';"
|
||||
psql -h localhost -U postgres -c "CREATE DATABASE llamastack OWNER llamastack;"
|
||||
psql -h localhost -U llamastack -d llamastack -c "CREATE EXTENSION IF NOT EXISTS vector;"
|
||||
```
|
||||
|
||||
## Installation
|
||||
|
||||
You can install PGVector using docker:
|
||||
|
||||
```bash
|
||||
docker pull pgvector/pgvector:pg17
|
||||
```
|
||||
## Documentation
|
||||
See [PGVector's documentation](https://github.com/pgvector/pgvector) for more details about PGVector in general.
|
||||
sidebar_label: Remote - Pgvector
|
||||
title: remote::pgvector
|
||||
---
|
||||
|
||||
|
@ -9,10 +107,108 @@ title: remote::pgvector
|
|||
|
||||
## Description
|
||||
|
||||
|
||||
[PGVector](https://github.com/pgvector/pgvector) is a remote vector database provider for Llama Stack. It
|
||||
allows you to store and query vectors directly within a PostgreSQL database.
|
||||
That means you'll get fast and efficient vector retrieval.
|
||||
|
||||
## Features
|
||||
|
||||
- Easy to use
|
||||
- Fully integrated with Llama Stack
|
||||
|
||||
There are three search implementations available for PGVectorIndex:
|
||||
|
||||
1. Vector Search:
|
||||
- How it works:
|
||||
- Uses PostgreSQL's vector extension (pgvector) to perform similarity search
|
||||
- Compares query embeddings against stored embeddings using Cosine distance or other distance metrics
|
||||
- E.g. SQL query: SELECT document, embedding <=> %s::vector AS distance FROM table ORDER BY distance
|
||||
|
||||
- Characteristics:
|
||||
- Semantic understanding - finds documents similar in meaning even if they don't share keywords
|
||||
- Works with high-dimensional vector embeddings (typically 768, 1024, or higher dimensions)
|
||||
- Best for: Finding conceptually related content, handling synonyms, cross-language search
|
||||
|
||||
2. Keyword Search
|
||||
- How it works:
|
||||
- Uses PostgreSQL's full-text search capabilities with tsvector and ts_rank
|
||||
- Converts text to searchable tokens using to_tsvector('english', text). Default language is English.
|
||||
- E.g. SQL query: SELECT document, ts_rank(tokenized_content, plainto_tsquery('english', %s)) AS score
|
||||
|
||||
- Characteristics:
|
||||
- Lexical matching - finds exact keyword matches and variations
|
||||
- Uses GIN (Generalized Inverted Index) for fast text search performance
|
||||
- Scoring: Uses PostgreSQL's ts_rank function for relevance scoring
|
||||
- Best for: Exact term matching, proper names, technical terms, Boolean-style queries
|
||||
|
||||
3. Hybrid Search
|
||||
- How it works:
|
||||
- Combines both vector and keyword search results
|
||||
- Runs both searches independently, then merges results using configurable reranking
|
||||
|
||||
- Two reranking strategies available:
|
||||
- Reciprocal Rank Fusion (RRF) - (default: 60.0)
|
||||
- Weighted Average - (default: 0.5)
|
||||
|
||||
- Characteristics:
|
||||
- Best of both worlds: semantic understanding + exact matching
|
||||
- Documents appearing in both searches get boosted scores
|
||||
- Configurable balance between semantic and lexical matching
|
||||
- Best for: General-purpose search where you want both precision and recall
|
||||
|
||||
4. Database Schema
|
||||
The PGVector implementation stores data optimized for all three search types:
|
||||
CREATE TABLE vector_store_xxx (
|
||||
id TEXT PRIMARY KEY,
|
||||
document JSONB, -- Original document
|
||||
embedding vector(dimension), -- For vector search
|
||||
content_text TEXT, -- Raw text content
|
||||
tokenized_content TSVECTOR -- For keyword search
|
||||
);
|
||||
|
||||
-- Indexes for performance
|
||||
CREATE INDEX content_gin_idx ON table USING GIN(tokenized_content); -- Keyword search
|
||||
-- Vector index created automatically by pgvector
|
||||
|
||||
## Usage
|
||||
|
||||
To use PGVector in your Llama Stack project, follow these steps:
|
||||
|
||||
1. Install the necessary dependencies.
|
||||
2. Configure your Llama Stack project to use pgvector (e.g. `remote::pgvector`).
|
||||
3. Start storing and querying vectors.
|
||||
|
||||
## Example: Setting up your environment for PGVector
|
||||
|
||||
1. Export env vars:
|
||||
```bash
|
||||
export ENABLE_PGVECTOR=true
|
||||
export PGVECTOR_HOST=localhost
|
||||
export PGVECTOR_PORT=5432
|
||||
export PGVECTOR_DB=llamastack
|
||||
export PGVECTOR_USER=llamastack
|
||||
export PGVECTOR_PASSWORD=llamastack
|
||||
```
|
||||
|
||||
2. Create DB:
|
||||
```bash
|
||||
psql -h localhost -U postgres -c "CREATE ROLE llamastack LOGIN PASSWORD 'llamastack';"
|
||||
psql -h localhost -U postgres -c "CREATE DATABASE llamastack OWNER llamastack;"
|
||||
psql -h localhost -U llamastack -d llamastack -c "CREATE EXTENSION IF NOT EXISTS vector;"
|
||||
```
|
||||
|
||||
## Installation
|
||||
|
||||
You can install PGVector using docker:
|
||||
|
||||
```bash
|
||||
docker pull pgvector/pgvector:pg17
|
||||
```
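
As with the other vector providers, the store is queried through the Llama Stack client rather than raw SQL. The sketch below is a non-authoritative example: it assumes a running server with `remote::pgvector` configured (for instance using the environment variables above) and an existing vector store, and it reuses the `search_mode` values described in the Features section; the store id and server address are placeholders.

```python
from llama_stack_client import LlamaStackClient

client = LlamaStackClient(base_url="http://localhost:8321")  # assumed server address

# Semantic (vector) search, backed by pgvector's distance operators.
vector_results = client.vector_stores.search(
    vector_store_id="my_pgvector_store",  # placeholder: assumed to exist already
    query="finding conceptually related content",
    search_mode="vector",
    max_num_results=5,
)

# Keyword search, backed by PostgreSQL full-text search (tsvector / ts_rank).
keyword_results = client.vector_stores.search(
    vector_store_id="my_pgvector_store",
    query="GIN index",
    search_mode="keyword",
    max_num_results=5,
)
print(vector_results, keyword_results)
```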
|
||||
## Documentation
|
||||
See [PGVector's documentation](https://github.com/pgvector/pgvector) for more details about PGVector in general.
|
||||
|
||||
|
||||
## Configuration
|
||||
|
||||
| Field | Type | Required | Default | Description |
|
||||
|
|
|
@ -1,7 +1,6 @@
|
|||
---
|
||||
description: Please refer to the inline provider documentation
|
||||
sidebar_label: Qdrant
|
||||
sidebar_position: 12
|
||||
description: "Please refer to the inline provider documentation."
|
||||
sidebar_label: Remote - Qdrant
|
||||
title: remote::qdrant
|
||||
---
|
||||
|
||||
|
@ -9,8 +8,10 @@ title: remote::qdrant
|
|||
|
||||
## Description
|
||||
|
||||
|
||||
Please refer to the inline provider documentation.
|
||||
|
||||
|
||||
## Configuration
|
||||
|
||||
| Field | Type | Required | Default | Description |
|
||||
|
|
|
@ -1,7 +1,35 @@
|
|||
---
|
||||
description: '[Weaviate](https://weaviate'
|
||||
sidebar_label: Weaviate
|
||||
sidebar_position: 13
|
||||
description: |
|
||||
[Weaviate](https://weaviate.io/) is a vector database provider for Llama Stack.
|
||||
It allows you to store and query vectors directly within a Weaviate database.
|
||||
That means you're not limited to storing vectors in memory or in a separate service.
|
||||
|
||||
## Features
|
||||
Weaviate supports:
|
||||
- Store embeddings and their metadata
|
||||
- Vector search
|
||||
- Full-text search
|
||||
- Hybrid search
|
||||
- Document storage
|
||||
- Metadata filtering
|
||||
- Multi-modal retrieval
|
||||
|
||||
|
||||
## Usage
|
||||
|
||||
To use Weaviate in your Llama Stack project, follow these steps:
|
||||
|
||||
1. Install the necessary dependencies.
|
||||
2. Configure your Llama Stack project to use Weaviate.
|
||||
3. Start storing and querying vectors.
|
||||
|
||||
## Installation
|
||||
|
||||
To install Weaviate, see the [Weaviate quickstart documentation](https://weaviate.io/developers/weaviate/quickstart).
|
||||
|
||||
## Documentation
|
||||
See [Weaviate's documentation](https://weaviate.io/developers/weaviate) for more details about Weaviate in general.
|
||||
sidebar_label: Remote - Weaviate
|
||||
title: remote::weaviate
|
||||
---
|
||||
|
||||
|
@ -9,10 +37,38 @@ title: remote::weaviate
|
|||
|
||||
## Description
|
||||
|
||||
|
||||
[Weaviate](https://weaviate.io/) is a vector database provider for Llama Stack.
|
||||
It allows you to store and query vectors directly within a Weaviate database.
|
||||
That means you're not limited to storing vectors in memory or in a separate service.
|
||||
|
||||
## Features
|
||||
Weaviate supports:
|
||||
- Store embeddings and their metadata
|
||||
- Vector search
|
||||
- Full-text search
|
||||
- Hybrid search
|
||||
- Document storage
|
||||
- Metadata filtering
|
||||
- Multi-modal retrieval
|
||||
|
||||
|
||||
## Usage
|
||||
|
||||
To use Weaviate in your Llama Stack project, follow these steps:
|
||||
|
||||
1. Install the necessary dependencies.
|
||||
2. Configure your Llama Stack project to use Weaviate.
|
||||
3. Start storing and querying vectors.
|
||||
|
||||
## Installation
|
||||
|
||||
To install Weaviate, see the [Weaviate quickstart documentation](https://weaviate.io/developers/weaviate/quickstart).
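
There is no client-side example on this page yet; the sketch below mirrors the search calls shown for other vector providers in this documentation. It assumes a Llama Stack server running locally with the `remote::weaviate` provider configured and an existing vector store; the store id and server address are placeholders.

```python
from llama_stack_client import LlamaStackClient

client = LlamaStackClient(base_url="http://localhost:8321")  # assumed server address

# Vector search against a Weaviate-backed vector store.
search_response = client.vector_stores.search(
    vector_store_id="my_weaviate_store",  # placeholder: assumed to exist already
    query="multi-modal retrieval examples",
    search_mode="vector",
    max_num_results=5,
)
print(search_response)
```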
|
||||
|
||||
## Documentation
|
||||
See [Weaviate's documentation](https://weaviate.io/developers/weaviate) for more details about Weaviate in general.
|
||||
|
||||
|
||||
## Configuration
|
||||
|
||||
| Field | Type | Required | Default | Description |
|
||||
|
|
|
@ -0,0 +1,125 @@
|
|||
---
|
||||
orphan: true
|
||||
---
|
||||
<!-- This file was auto-generated by distro_codegen.py, please edit source -->
|
||||
# Meta Reference GPU Distribution
|
||||
|
||||
```{toctree}
|
||||
:maxdepth: 2
|
||||
:hidden:
|
||||
|
||||
self
|
||||
```
|
||||
|
||||
The `llamastack/distribution-meta-reference-gpu` distribution consists of the following provider configurations:
|
||||
|
||||
| API | Provider(s) |
|
||||
|-----|-------------|
|
||||
| agents | `inline::meta-reference` |
|
||||
| datasetio | `remote::huggingface`, `inline::localfs` |
|
||||
| eval | `inline::meta-reference` |
|
||||
| inference | `inline::meta-reference` |
|
||||
| safety | `inline::llama-guard` |
|
||||
| scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` |
|
||||
| telemetry | `inline::meta-reference` |
|
||||
| tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::rag-runtime`, `remote::model-context-protocol` |
|
||||
| vector_io | `inline::faiss`, `remote::chromadb`, `remote::pgvector` |
|
||||
|
||||
|
||||
Note that you need access to NVIDIA GPUs to run this distribution. This distribution is not compatible with CPU-only machines or machines with AMD GPUs.
|
||||
|
||||
### Environment Variables
|
||||
|
||||
The following environment variables can be configured:
|
||||
|
||||
- `LLAMA_STACK_PORT`: Port for the Llama Stack distribution server (default: `8321`)
|
||||
- `INFERENCE_MODEL`: Inference model loaded into the Meta Reference server (default: `meta-llama/Llama-3.2-3B-Instruct`)
|
||||
- `INFERENCE_CHECKPOINT_DIR`: Directory containing the Meta Reference model checkpoint (default: `null`)
|
||||
- `SAFETY_MODEL`: Name of the safety (Llama-Guard) model to use (default: `meta-llama/Llama-Guard-3-1B`)
|
||||
- `SAFETY_CHECKPOINT_DIR`: Directory containing the Llama-Guard model checkpoint (default: `null`)
|
||||
|
||||
|
||||
## Prerequisite: Downloading Models
|
||||
|
||||
Please use `llama model list --downloaded` to check that you have llama model checkpoints downloaded in `~/.llama` before proceeding. See the [installation guide](../../references/llama_cli_reference/download_models.md) for how to download the models. Run `llama model list` to see the available models to download, and `llama model download` to download the checkpoints.
|
||||
|
||||
```
|
||||
$ llama model list --downloaded
|
||||
┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┓
|
||||
┃ Model ┃ Size ┃ Modified Time ┃
|
||||
┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━┩
|
||||
│ Llama3.2-1B-Instruct:int4-qlora-eo8 │ 1.53 GB │ 2025-02-26 11:22:28 │
|
||||
├─────────────────────────────────────────┼──────────┼─────────────────────┤
|
||||
│ Llama3.2-1B │ 2.31 GB │ 2025-02-18 21:48:52 │
|
||||
├─────────────────────────────────────────┼──────────┼─────────────────────┤
|
||||
│ Prompt-Guard-86M │ 0.02 GB │ 2025-02-26 11:29:28 │
|
||||
├─────────────────────────────────────────┼──────────┼─────────────────────┤
|
||||
│ Llama3.2-3B-Instruct:int4-spinquant-eo8 │ 3.69 GB │ 2025-02-26 11:37:41 │
|
||||
├─────────────────────────────────────────┼──────────┼─────────────────────┤
|
||||
│ Llama3.2-3B │ 5.99 GB │ 2025-02-18 21:51:26 │
|
||||
├─────────────────────────────────────────┼──────────┼─────────────────────┤
|
||||
│ Llama3.1-8B │ 14.97 GB │ 2025-02-16 10:36:37 │
|
||||
├─────────────────────────────────────────┼──────────┼─────────────────────┤
|
||||
│ Llama3.2-1B-Instruct:int4-spinquant-eo8 │ 1.51 GB │ 2025-02-26 11:35:02 │
|
||||
├─────────────────────────────────────────┼──────────┼─────────────────────┤
|
||||
│ Llama-Guard-3-1B │ 2.80 GB │ 2025-02-26 11:20:46 │
|
||||
├─────────────────────────────────────────┼──────────┼─────────────────────┤
|
||||
│ Llama-Guard-3-1B:int4 │ 0.43 GB │ 2025-02-26 11:33:33 │
|
||||
└─────────────────────────────────────────┴──────────┴─────────────────────┘
|
||||
```
|
||||
|
||||
## Running the Distribution
|
||||
|
||||
You can do this via venv or via Docker, which has a pre-built image.
|
||||
|
||||
### Via Docker
|
||||
|
||||
This method allows you to get started quickly without having to build the distribution code.
|
||||
|
||||
```bash
|
||||
LLAMA_STACK_PORT=8321
|
||||
docker run \
|
||||
-it \
|
||||
--pull always \
|
||||
--gpus all \
|
||||
-p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
|
||||
-v ~/.llama:/root/.llama \
|
||||
llamastack/distribution-meta-reference-gpu \
|
||||
--port $LLAMA_STACK_PORT \
|
||||
--env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct
|
||||
```
|
||||
|
||||
If you are using Llama Stack Safety / Shield APIs, use:
|
||||
|
||||
```bash
|
||||
docker run \
|
||||
-it \
|
||||
--pull always \
|
||||
--gpus all \
|
||||
-p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
|
||||
-v ~/.llama:/root/.llama \
|
||||
llamastack/distribution-meta-reference-gpu \
|
||||
--port $LLAMA_STACK_PORT \
|
||||
--env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct \
|
||||
--env SAFETY_MODEL=meta-llama/Llama-Guard-3-1B
|
||||
```
|
||||
|
||||
### Via venv
|
||||
|
||||
Make sure you have done `uv pip install llama-stack` and have the Llama Stack CLI available.
|
||||
|
||||
```bash
|
||||
llama stack build --distro meta-reference-gpu --image-type venv
|
||||
llama stack run distributions/meta-reference-gpu/run.yaml \
|
||||
--port 8321 \
|
||||
--env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct
|
||||
```
|
||||
|
||||
If you are using Llama Stack Safety / Shield APIs, use:
|
||||
|
||||
```bash
|
||||
llama stack run distributions/meta-reference-gpu/run-with-safety.yaml \
|
||||
--port 8321 \
|
||||
--env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct \
|
||||
--env SAFETY_MODEL=meta-llama/Llama-Guard-3-1B
|
||||
```
|
171
docs/source/distributions/self_hosted_distro/nvidia.md
Normal file
|
@ -0,0 +1,171 @@
|
|||
---
|
||||
orphan: true
|
||||
---
|
||||
<!-- This file was auto-generated by distro_codegen.py, please edit source -->
|
||||
# NVIDIA Distribution
|
||||
|
||||
The `llamastack/distribution-nvidia` distribution consists of the following provider configurations.
|
||||
|
||||
| API | Provider(s) |
|
||||
|-----|-------------|
|
||||
| agents | `inline::meta-reference` |
|
||||
| datasetio | `inline::localfs`, `remote::nvidia` |
|
||||
| eval | `remote::nvidia` |
|
||||
| files | `inline::localfs` |
|
||||
| inference | `remote::nvidia` |
|
||||
| post_training | `remote::nvidia` |
|
||||
| safety | `remote::nvidia` |
|
||||
| scoring | `inline::basic` |
|
||||
| telemetry | `inline::meta-reference` |
|
||||
| tool_runtime | `inline::rag-runtime` |
|
||||
| vector_io | `inline::faiss` |
|
||||
|
||||
|
||||
### Environment Variables
|
||||
|
||||
The following environment variables can be configured:
|
||||
|
||||
- `NVIDIA_API_KEY`: NVIDIA API Key (default: ``)
|
||||
- `NVIDIA_APPEND_API_VERSION`: Whether to append the API version to the base_url (default: `True`)
|
||||
- `NVIDIA_DATASET_NAMESPACE`: NVIDIA Dataset Namespace (default: `default`)
|
||||
- `NVIDIA_PROJECT_ID`: NVIDIA Project ID (default: `test-project`)
|
||||
- `NVIDIA_CUSTOMIZER_URL`: NVIDIA Customizer URL (default: `https://customizer.api.nvidia.com`)
|
||||
- `NVIDIA_OUTPUT_MODEL_DIR`: NVIDIA Output Model Directory (default: `test-example-model@v1`)
|
||||
- `GUARDRAILS_SERVICE_URL`: URL for the NeMo Guardrails Service (default: `http://0.0.0.0:7331`)
|
||||
- `NVIDIA_GUARDRAILS_CONFIG_ID`: NVIDIA Guardrail Configuration ID (default: `self-check`)
|
||||
- `NVIDIA_EVALUATOR_URL`: URL for the NeMo Evaluator Service (default: `http://0.0.0.0:7331`)
|
||||
- `INFERENCE_MODEL`: Inference model (default: `Llama3.1-8B-Instruct`)
|
||||
- `SAFETY_MODEL`: Name of the model to use for safety (default: `meta/llama-3.1-8b-instruct`)
|
||||
|
||||
### Models
|
||||
|
||||
The following models are available by default:
|
||||
|
||||
- `meta/llama3-8b-instruct `
|
||||
- `meta/llama3-70b-instruct `
|
||||
- `meta/llama-3.1-8b-instruct `
|
||||
- `meta/llama-3.1-70b-instruct `
|
||||
- `meta/llama-3.1-405b-instruct `
|
||||
- `meta/llama-3.2-1b-instruct `
|
||||
- `meta/llama-3.2-3b-instruct `
|
||||
- `meta/llama-3.2-11b-vision-instruct `
|
||||
- `meta/llama-3.2-90b-vision-instruct `
|
||||
- `meta/llama-3.3-70b-instruct `
|
||||
- `nvidia/vila `
|
||||
- `nvidia/llama-3.2-nv-embedqa-1b-v2 `
|
||||
- `nvidia/nv-embedqa-e5-v5 `
|
||||
- `nvidia/nv-embedqa-mistral-7b-v2 `
|
||||
- `snowflake/arctic-embed-l `
|
||||
|
||||
|
||||
## Prerequisites
|
||||
### NVIDIA API Keys
|
||||
|
||||
Make sure you have access to an NVIDIA API key. You can get one by visiting [https://build.nvidia.com/](https://build.nvidia.com/). Use this key for the `NVIDIA_API_KEY` environment variable.
|
||||
|
||||
### Deploy NeMo Microservices Platform
|
||||
The NVIDIA NeMo microservices platform supports end-to-end microservice deployment of a complete AI flywheel on your Kubernetes cluster through the NeMo Microservices Helm Chart. Please reference the [NVIDIA NeMo Microservices documentation](https://docs.nvidia.com/nemo/microservices/latest/about/index.html) for platform prerequisites and instructions to install and deploy the platform.
|
||||
|
||||
## Supported Services
|
||||
Each Llama Stack API corresponds to a specific NeMo microservice. The core microservices (Customizer, Evaluator, Guardrails) are exposed by the same endpoint. The platform components (Data Store) are each exposed by separate endpoints.
|
||||
|
||||
### Inference: NVIDIA NIM
|
||||
NVIDIA NIM is used for running inference with registered models. There are two ways to access NVIDIA NIMs:
|
||||
1. Hosted (default): Preview APIs hosted at https://integrate.api.nvidia.com (Requires an API key)
|
||||
2. Self-hosted: NVIDIA NIMs that run on your own infrastructure.
|
||||
|
||||
The deployed platform includes the NIM Proxy microservice, which is the service that provides access to your NIMs (for example, to run inference on a model). Set the `NVIDIA_BASE_URL` environment variable to use your NVIDIA NIM Proxy deployment.
|
||||
|
||||
### Datasetio API: NeMo Data Store
|
||||
The NeMo Data Store microservice serves as the default file storage solution for the NeMo microservices platform. It exposes APIs compatible with the Hugging Face Hub client (`HfApi`), so you can use the client to interact with Data Store. The `NVIDIA_DATASETS_URL` environment variable should point to your NeMo Data Store endpoint.
|
||||
|
||||
See the {repopath}`NVIDIA Datasetio docs::llama_stack/providers/remote/datasetio/nvidia/README.md` for supported features and example usage.
|
||||

### Eval API: NeMo Evaluator

The NeMo Evaluator microservice supports evaluation of LLMs. Launching an Evaluation job with NeMo Evaluator requires an Evaluation Config (an object that contains metadata needed by the job). A Llama Stack Benchmark maps to an Evaluation Config, so registering a Benchmark creates an Evaluation Config in NeMo Evaluator. The `NVIDIA_EVALUATOR_URL` environment variable should point to your NeMo Microservices endpoint.

See the {repopath}`NVIDIA Eval docs::llama_stack/providers/remote/eval/nvidia/README.md` for supported features and example usage.

### Post-Training API: NeMo Customizer

The NeMo Customizer microservice supports fine-tuning models. See {repopath}`this list of supported models::llama_stack/providers/remote/post_training/nvidia/models.py` for the models that can be fine-tuned using Llama Stack. The `NVIDIA_CUSTOMIZER_URL` environment variable should point to your NeMo Microservices endpoint.

See the {repopath}`NVIDIA Post-Training docs::llama_stack/providers/remote/post_training/nvidia/README.md` for supported features and example usage.

### Safety API: NeMo Guardrails

The NeMo Guardrails microservice sits between your application and the LLM, and adds checks and content moderation to a model. The `GUARDRAILS_SERVICE_URL` environment variable should point to your NeMo Microservices endpoint.

See the {repopath}`NVIDIA Safety docs::llama_stack/providers/remote/safety/nvidia/README.md` for supported features and example usage.

## Deploying models

In order to use a registered model with the Llama Stack APIs, ensure the corresponding NIM is deployed to your environment. For example, you can use the NIM Proxy microservice to deploy `meta/llama-3.2-1b-instruct`.

Note: For improved inference speeds, use NIM with the `fast_outlines` guided decoding system (specified in the request body). This is the default if you deployed the platform with the NeMo Microservices Helm Chart.
```sh
# URL to NeMo NIM Proxy service
export NEMO_URL="http://nemo.test"

curl --location "$NEMO_URL/v1/deployment/model-deployments" \
  -H 'accept: application/json' \
  -H 'Content-Type: application/json' \
  -d '{
    "name": "llama-3.2-1b-instruct",
    "namespace": "meta",
    "config": {
      "model": "meta/llama-3.2-1b-instruct",
      "nim_deployment": {
        "image_name": "nvcr.io/nim/meta/llama-3.2-1b-instruct",
        "image_tag": "1.8.3",
        "pvc_size": "25Gi",
        "gpu": 1,
        "additional_envs": {
          "NIM_GUIDED_DECODING_BACKEND": "fast_outlines"
        }
      }
    }
  }'
```
This NIM deployment should take approximately 10 minutes to go live. [See the docs](https://docs.nvidia.com/nemo/microservices/latest/get-started/tutorials/deploy-nims.html) for more information on how to deploy a NIM and verify it's available for inference.
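
While you wait, you can poll the deployment you just created. This is a sketch that assumes the deployment endpoint also supports `GET` on the same resource path used for creation and deletion:

```bash
# Assumed status check -- GET on the deployment resource created above
curl "$NEMO_URL/v1/deployment/model-deployments/meta/llama-3.2-1b-instruct" \
  -H 'accept: application/json'
```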

You can also remove a deployed NIM to free up GPU resources, if needed.
```sh
export NEMO_URL="http://nemo.test"

curl -X DELETE "$NEMO_URL/v1/deployment/model-deployments/meta/llama-3.1-8b-instruct"
```

## Running Llama Stack with NVIDIA

You can run Llama Stack either in a venv (building the distribution from source) or with Docker, which uses a pre-built image.

### Via Docker

This method allows you to get started quickly without having to build the distribution code.
```bash
LLAMA_STACK_PORT=8321
docker run \
  -it \
  --pull always \
  -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
  -v ./run.yaml:/root/my-run.yaml \
  llamastack/distribution-nvidia \
  --config /root/my-run.yaml \
  --port $LLAMA_STACK_PORT \
  --env NVIDIA_API_KEY=$NVIDIA_API_KEY
```

### Via venv

If you've set up your local development environment, you can also build and run the distribution using your local virtual environment.
```bash
INFERENCE_MODEL=meta-llama/Llama-3.1-8B-Instruct
llama stack build --distro nvidia --image-type venv
llama stack run ./run.yaml \
  --port 8321 \
  --env NVIDIA_API_KEY=$NVIDIA_API_KEY \
  --env INFERENCE_MODEL=$INFERENCE_MODEL
```
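
With either method, once the server is up you can sanity-check that it is reachable by listing the registered models. This sketch assumes the default port of 8321 used above:

```bash
# Assumes the stack is listening on the default port used above
curl http://localhost:8321/v1/models
```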

## Example Notebooks

For examples of how to use the NVIDIA Distribution to run inference, fine-tune, evaluate, and run safety checks on your LLMs, you can reference the example notebooks in {repopath}`docs/notebooks/nvidia`.
@@ -10,11 +10,11 @@ import sys
from pathlib import Path
from typing import Any

from llama_stack.core.distribution import get_provider_registry

from pydantic_core import PydanticUndefined
from rich.progress import Progress, SpinnerColumn, TextColumn

from llama_stack.core.distribution import get_provider_registry

REPO_ROOT = Path(__file__).parent.parent


@@ -22,9 +22,7 @@ def get_api_docstring(api_name: str) -> str | None:
    """Extract docstring from the API protocol class."""
    try:
        # Import the API module dynamically
        api_module = __import__(
            f"llama_stack.apis.{api_name}", fromlist=[api_name.title()]
        )
        api_module = __import__(f"llama_stack.apis.{api_name}", fromlist=[api_name.title()])

        # Get the main protocol class (usually capitalized API name)
        protocol_class_name = api_name.title()
@@ -72,10 +70,7 @@ def get_config_class_info(config_class_path: str) -> dict[str, Any]:
            model_config = config_class.model_config
            if hasattr(model_config, "extra") and model_config.extra == "allow":
                accepts_extra_config = True
            elif (
                isinstance(model_config, dict)
                and model_config.get("extra") == "allow"
            ):
            elif isinstance(model_config, dict) and model_config.get("extra") == "allow":
                accepts_extra_config = True

        fields_info = {}
@@ -84,19 +79,9 @@ def get_config_class_info(config_class_path: str) -> dict[str, Any]:
            field_type = str(field.annotation) if field.annotation else "Any"

            # this string replace is ridiculous
            field_type = (
                field_type.replace("typing.", "")
                .replace("Optional[", "")
                .replace("]", "")
            )
            field_type = (
                field_type.replace("Annotated[", "")
                .replace("FieldInfo(", "")
                .replace(")", "")
            )
            field_type = field_type.replace(
                "llama_stack.apis.inference.inference.", ""
            )
            field_type = field_type.replace("typing.", "").replace("Optional[", "").replace("]", "")
            field_type = field_type.replace("Annotated[", "").replace("FieldInfo(", "").replace(")", "")
            field_type = field_type.replace("llama_stack.apis.inference.inference.", "")
            field_type = field_type.replace("llama_stack.providers.", "")

            default_value = field.default
@@ -106,10 +91,7 @@ def get_config_class_info(config_class_path: str) -> dict[str, Any]:
            # HACK ALERT:
            # If the default value contains a path that looks like it came from RUNTIME_BASE_DIR,
            # replace it with a generic ~/.llama/ path for documentation
            if (
                isinstance(default_value, str)
                and "/.llama/" in default_value
            ):
            if isinstance(default_value, str) and "/.llama/" in default_value:
                if ".llama/" in default_value:
                    path_part = default_value.split(".llama/")[-1]
                    default_value = f"~/.llama/{path_part}"
@@ -135,11 +117,7 @@ def get_config_class_info(config_class_path: str) -> dict[str, Any]:
        lines = source.split("\n")

        for i, line in enumerate(lines):
            if (
                "model_config" in line
                and "ConfigDict" in line
                and 'extra="allow"' in line
            ):
            if "model_config" in line and "ConfigDict" in line and 'extra="allow"' in line:
                comments = []
                for j in range(i - 1, -1, -1):
                    stripped = lines[j].strip()
@@ -204,9 +182,7 @@ def generate_provider_docs(progress, provider_spec: Any, api_name: str) -> str:
    # Create sidebar label (clean up provider_type for display)
    sidebar_label = provider_type.replace("::", " - ").replace("_", " ")
    if sidebar_label.startswith("inline - "):
        sidebar_label = sidebar_label[
            9:
        ].title()  # Remove "inline - " prefix and title case
        sidebar_label = sidebar_label[9:].title()  # Remove "inline - " prefix and title case
    else:
        sidebar_label = sidebar_label.title()

@@ -219,7 +195,8 @@ def generate_provider_docs(progress, provider_spec: Any, api_name: str) -> str:
        if "\n" in description.strip():
            md_lines.append("description: |")
            for line in description.strip().split("\n"):
                md_lines.append(f"  {line}")
                # Avoid trailing whitespace by only adding spaces to non-empty lines
                md_lines.append(f"  {line}" if line.strip() else "")
        else:
            # For single line descriptions, format properly for YAML
            clean_desc = description.strip().replace('"', '\\"')
@@ -248,14 +225,10 @@ def generate_provider_docs(progress, provider_spec: Any, api_name: str) -> str:
        for field_name, field_info in config_info["fields"].items():
            field_type = field_info["type"].replace("|", "\\|")
            required = "Yes" if field_info["required"] else "No"
            default = (
                str(field_info["default"]) if field_info["default"] is not None else ""
            )
            default = str(field_info["default"]) if field_info["default"] is not None else ""
            description_text = field_info["description"] or ""

            md_lines.append(
                f"| `{field_name}` | `{field_type}` | {required} | {default} | {description_text} |"
            )
            md_lines.append(f"| `{field_name}` | `{field_type}` | {required} | {default} | {description_text} |")

        md_lines.append("")

@@ -297,22 +270,16 @@ def generate_provider_docs(progress, provider_spec: Any, api_name: str) -> str:
                return obj

            sample_config_dict = convert_pydantic_to_dict(sample_config)
            md_lines.append(
                yaml.dump(
                    sample_config_dict, default_flow_style=False, sort_keys=False
                )
            )
            # Strip trailing newlines from yaml.dump to prevent extra blank lines
            yaml_output = yaml.dump(sample_config_dict, default_flow_style=False, sort_keys=False).rstrip()
            md_lines.append(yaml_output)
        else:
            md_lines.append("# No sample configuration available.")
    except Exception as e:
        md_lines.append(f"# Error generating sample config: {str(e)}")
    md_lines.append("```")
    md_lines.append("")

    if (
        hasattr(provider_spec, "deprecation_warning")
        and provider_spec.deprecation_warning
    ):
    if hasattr(provider_spec, "deprecation_warning") and provider_spec.deprecation_warning:
        md_lines.append("## Deprecation Notice")
        md_lines.append("")
        md_lines.append(":::warning")
@@ -330,9 +297,7 @@ def generate_provider_docs(progress, provider_spec: Any, api_name: str) -> str:
    return "\n".join(md_lines) + "\n"


def generate_index_docs(
    api_name: str, api_docstring: str | None, provider_entries: list
) -> str:
def generate_index_docs(api_name: str, api_docstring: str | None, provider_entries: list) -> str:
    """Generate MDX documentation for the index file."""
    # Create sidebar label for the API
    sidebar_label = api_name.replace("_", " ").title()
@@ -360,9 +325,7 @@ def generate_index_docs(
        md_lines.append(f"{cleaned_docstring}")
        md_lines.append("")

    md_lines.append(
        f"This section contains documentation for all available providers for the **{api_name}** API."
    )
    md_lines.append(f"This section contains documentation for all available providers for the **{api_name}** API.")
    md_lines.append("")

    md_lines.append("## Providers")
@@ -373,9 +336,8 @@ def generate_index_docs(
        provider_name = entry["display_name"]
        filename = entry["filename"]
        md_lines.append(f"- [{provider_name}](./{filename})")
    md_lines.append("")

    return "\n".join(md_lines)
    return "\n".join(md_lines) + "\n"


def process_provider_registry(progress, change_tracker: ChangedPathTracker) -> None:
@@ -411,14 +373,10 @@ def process_provider_registry(progress, change_tracker: ChangedPathTracker) -> N
            else:
                display_name = display_name.title()

            provider_entries.append(
                {"filename": filename, "display_name": display_name}
            )
            provider_entries.append({"filename": filename, "display_name": display_name})

        # Generate index file with frontmatter
        index_content = generate_index_docs(
            api_name, api_docstring, provider_entries
        )
        index_content = generate_index_docs(api_name, api_docstring, provider_entries)
        index_file = doc_output_dir / "index.mdx"
        index_file.write_text(index_content)
        change_tracker.add_paths(index_file)