diff --git a/docs/source/distributions/self_hosted_distro/nvidia.md b/docs/source/distributions/self_hosted_distro/nvidia.md index 6e399e6ce..e845c3c48 100644 --- a/docs/source/distributions/self_hosted_distro/nvidia.md +++ b/docs/source/distributions/self_hosted_distro/nvidia.md @@ -157,7 +157,7 @@ docker run \ If you've set up your local development environment, you can also build the image using your local virtual environment. ```bash -INFERENCE_MODEL=meta-llama/Llama-3.1-8b-Instruct +INFERENCE_MODEL=meta-llama/Llama-3.1-8B-Instruct llama stack build --distro nvidia --image-type venv llama stack run ./run.yaml \ --port 8321 \ diff --git a/docs/source/providers/vector_io/inline_faiss.md b/docs/source/providers/vector_io/inline_faiss.md index bcff66f3f..cfa18a839 100644 --- a/docs/source/providers/vector_io/inline_faiss.md +++ b/docs/source/providers/vector_io/inline_faiss.md @@ -12,6 +12,18 @@ That means you'll get fast and efficient vector retrieval. - Lightweight and easy to use - Fully integrated with Llama Stack - GPU support +- **Vector search** - FAISS supports pure vector similarity search using embeddings + +## Search Modes + +**Supported:** +- **Vector Search** (`mode="vector"`): Performs vector similarity search using embeddings + +**Not Supported:** +- **Keyword Search** (`mode="keyword"`): Not supported by FAISS +- **Hybrid Search** (`mode="hybrid"`): Not supported by FAISS + +> **Note**: FAISS is designed as a pure vector similarity search library. See the [FAISS GitHub repository](https://github.com/facebookresearch/faiss) for more details about FAISS's core functionality. ## Usage diff --git a/docs/source/providers/vector_io/remote_milvus.md b/docs/source/providers/vector_io/remote_milvus.md index 3646f4acc..2af64b8bb 100644 --- a/docs/source/providers/vector_io/remote_milvus.md +++ b/docs/source/providers/vector_io/remote_milvus.md @@ -11,6 +11,7 @@ That means you're not limited to storing vectors in memory or in a separate serv - Easy to use - Fully integrated with Llama Stack +- Supports all search modes: vector, keyword, and hybrid search (both inline and remote configurations) ## Usage @@ -101,6 +102,92 @@ vector_io: - **`client_pem_path`**: Path to the **client certificate** file (required for mTLS). - **`client_key_path`**: Path to the **client private key** file (required for mTLS). +## Search Modes + +Milvus supports three different search modes for both inline and remote configurations: + +### Vector Search +Vector search uses semantic similarity to find the most relevant chunks based on embedding vectors. This is the default search mode and works well for finding conceptually similar content. + +```python +# Vector search example +search_response = client.vector_stores.search( + vector_store_id=vector_store.id, + query="What is machine learning?", + search_mode="vector", + max_num_results=5, +) +``` + +### Keyword Search +Keyword search uses traditional text-based matching to find chunks containing specific terms or phrases. This is useful when you need exact term matches. + +```python +# Keyword search example +search_response = client.vector_stores.search( + vector_store_id=vector_store.id, + query="Python programming language", + search_mode="keyword", + max_num_results=5, +) +``` + +### Hybrid Search +Hybrid search combines both vector and keyword search methods to provide more comprehensive results. It leverages the strengths of both semantic similarity and exact term matching. + +#### Basic Hybrid Search +```python +# Basic hybrid search example (uses RRF ranker with default impact_factor=60.0) +search_response = client.vector_stores.search( + vector_store_id=vector_store.id, + query="neural networks in Python", + search_mode="hybrid", + max_num_results=5, +) +``` + +**Note**: The default `impact_factor` value of 60.0 was empirically determined to be optimal in the original RRF research paper: ["Reciprocal Rank Fusion outperforms Condorcet and individual Rank Learning Methods"](https://plg.uwaterloo.ca/~gvcormac/cormacksigir09-rrf.pdf) (Cormack et al., 2009). + +#### Hybrid Search with RRF (Reciprocal Rank Fusion) Ranker +RRF combines rankings from vector and keyword search by using reciprocal ranks. The impact factor controls how much weight is given to higher-ranked results. + +```python +# Hybrid search with custom RRF parameters +search_response = client.vector_stores.search( + vector_store_id=vector_store.id, + query="neural networks in Python", + search_mode="hybrid", + max_num_results=5, + ranking_options={ + "ranker": { + "type": "rrf", + "impact_factor": 100.0, # Higher values give more weight to top-ranked results + } + }, +) +``` + +#### Hybrid Search with Weighted Ranker +Weighted ranker linearly combines normalized scores from vector and keyword search. The alpha parameter controls the balance between the two search methods. + +```python +# Hybrid search with weighted ranker +search_response = client.vector_stores.search( + vector_store_id=vector_store.id, + query="neural networks in Python", + search_mode="hybrid", + max_num_results=5, + ranking_options={ + "ranker": { + "type": "weighted", + "alpha": 0.7, # 70% vector search, 30% keyword search + } + }, +) +``` + +For detailed documentation on RRF and Weighted rankers, please refer to the [Milvus Reranking Guide](https://milvus.io/docs/reranking.md). + ## Documentation See the [Milvus documentation](https://milvus.io/docs/install-overview.md) for more details about Milvus in general. diff --git a/llama_stack/distributions/nvidia/doc_template.md b/llama_stack/distributions/nvidia/doc_template.md index 3884e6b51..56e99e523 100644 --- a/llama_stack/distributions/nvidia/doc_template.md +++ b/llama_stack/distributions/nvidia/doc_template.md @@ -129,7 +129,7 @@ docker run \ If you've set up your local development environment, you can also build the image using your local virtual environment. ```bash -INFERENCE_MODEL=meta-llama/Llama-3.1-8b-Instruct +INFERENCE_MODEL=meta-llama/Llama-3.1-8B-Instruct llama stack build --distro nvidia --image-type venv llama stack run ./run.yaml \ --port 8321 \ diff --git a/llama_stack/providers/inline/vector_io/faiss/faiss.py b/llama_stack/providers/inline/vector_io/faiss/faiss.py index 7a5373726..5a063592c 100644 --- a/llama_stack/providers/inline/vector_io/faiss/faiss.py +++ b/llama_stack/providers/inline/vector_io/faiss/faiss.py @@ -174,7 +174,9 @@ class FaissIndex(EmbeddingIndex): k: int, score_threshold: float, ) -> QueryChunksResponse: - raise NotImplementedError("Keyword search is not supported in FAISS") + raise NotImplementedError( + "Keyword search is not supported - underlying DB FAISS does not support this search mode" + ) async def query_hybrid( self, @@ -185,7 +187,9 @@ class FaissIndex(EmbeddingIndex): reranker_type: str, reranker_params: dict[str, Any] | None = None, ) -> QueryChunksResponse: - raise NotImplementedError("Hybrid search is not supported in FAISS") + raise NotImplementedError( + "Hybrid search is not supported - underlying DB FAISS does not support this search mode" + ) class FaissVectorIOAdapter(OpenAIVectorStoreMixin, VectorIO, VectorDBsProtocolPrivate): diff --git a/llama_stack/providers/registry/vector_io.py b/llama_stack/providers/registry/vector_io.py index 846f7b88e..ed170b508 100644 --- a/llama_stack/providers/registry/vector_io.py +++ b/llama_stack/providers/registry/vector_io.py @@ -45,6 +45,18 @@ That means you'll get fast and efficient vector retrieval. - Lightweight and easy to use - Fully integrated with Llama Stack - GPU support +- **Vector search** - FAISS supports pure vector similarity search using embeddings + +## Search Modes + +**Supported:** +- **Vector Search** (`mode="vector"`): Performs vector similarity search using embeddings + +**Not Supported:** +- **Keyword Search** (`mode="keyword"`): Not supported by FAISS +- **Hybrid Search** (`mode="hybrid"`): Not supported by FAISS + +> **Note**: FAISS is designed as a pure vector similarity search library. See the [FAISS GitHub repository](https://github.com/facebookresearch/faiss) for more details about FAISS's core functionality. ## Usage @@ -535,6 +547,7 @@ That means you're not limited to storing vectors in memory or in a separate serv - Easy to use - Fully integrated with Llama Stack +- Supports all search modes: vector, keyword, and hybrid search (both inline and remote configurations) ## Usage @@ -625,6 +638,92 @@ vector_io: - **`client_pem_path`**: Path to the **client certificate** file (required for mTLS). - **`client_key_path`**: Path to the **client private key** file (required for mTLS). +## Search Modes + +Milvus supports three different search modes for both inline and remote configurations: + +### Vector Search +Vector search uses semantic similarity to find the most relevant chunks based on embedding vectors. This is the default search mode and works well for finding conceptually similar content. + +```python +# Vector search example +search_response = client.vector_stores.search( + vector_store_id=vector_store.id, + query="What is machine learning?", + search_mode="vector", + max_num_results=5, +) +``` + +### Keyword Search +Keyword search uses traditional text-based matching to find chunks containing specific terms or phrases. This is useful when you need exact term matches. + +```python +# Keyword search example +search_response = client.vector_stores.search( + vector_store_id=vector_store.id, + query="Python programming language", + search_mode="keyword", + max_num_results=5, +) +``` + +### Hybrid Search +Hybrid search combines both vector and keyword search methods to provide more comprehensive results. It leverages the strengths of both semantic similarity and exact term matching. + +#### Basic Hybrid Search +```python +# Basic hybrid search example (uses RRF ranker with default impact_factor=60.0) +search_response = client.vector_stores.search( + vector_store_id=vector_store.id, + query="neural networks in Python", + search_mode="hybrid", + max_num_results=5, +) +``` + +**Note**: The default `impact_factor` value of 60.0 was empirically determined to be optimal in the original RRF research paper: ["Reciprocal Rank Fusion outperforms Condorcet and individual Rank Learning Methods"](https://plg.uwaterloo.ca/~gvcormac/cormacksigir09-rrf.pdf) (Cormack et al., 2009). + +#### Hybrid Search with RRF (Reciprocal Rank Fusion) Ranker +RRF combines rankings from vector and keyword search by using reciprocal ranks. The impact factor controls how much weight is given to higher-ranked results. + +```python +# Hybrid search with custom RRF parameters +search_response = client.vector_stores.search( + vector_store_id=vector_store.id, + query="neural networks in Python", + search_mode="hybrid", + max_num_results=5, + ranking_options={ + "ranker": { + "type": "rrf", + "impact_factor": 100.0, # Higher values give more weight to top-ranked results + } + }, +) +``` + +#### Hybrid Search with Weighted Ranker +Weighted ranker linearly combines normalized scores from vector and keyword search. The alpha parameter controls the balance between the two search methods. + +```python +# Hybrid search with weighted ranker +search_response = client.vector_stores.search( + vector_store_id=vector_store.id, + query="neural networks in Python", + search_mode="hybrid", + max_num_results=5, + ranking_options={ + "ranker": { + "type": "weighted", + "alpha": 0.7, # 70% vector search, 30% keyword search + } + }, +) +``` + +For detailed documentation on RRF and Weighted rankers, please refer to the [Milvus Reranking Guide](https://milvus.io/docs/reranking.md). + ## Documentation See the [Milvus documentation](https://milvus.io/docs/install-overview.md) for more details about Milvus in general. diff --git a/llama_stack/providers/remote/inference/gemini/models.py b/llama_stack/providers/remote/inference/gemini/models.py index 6fda35e0f..bd696b0ac 100644 --- a/llama_stack/providers/remote/inference/gemini/models.py +++ b/llama_stack/providers/remote/inference/gemini/models.py @@ -13,7 +13,9 @@ LLM_MODEL_IDS = [ "gemini-1.5-flash", "gemini-1.5-pro", "gemini-2.0-flash", + "gemini-2.0-flash-lite", "gemini-2.5-flash", + "gemini-2.5-flash-lite", "gemini-2.5-pro", ] diff --git a/llama_stack/providers/remote/inference/nvidia/NVIDIA.md b/llama_stack/providers/remote/inference/nvidia/NVIDIA.md index 2505718e0..4a072215c 100644 --- a/llama_stack/providers/remote/inference/nvidia/NVIDIA.md +++ b/llama_stack/providers/remote/inference/nvidia/NVIDIA.md @@ -42,8 +42,8 @@ client.initialize() ### Create Completion ```python -response = client.completion( - model_id="meta-llama/Llama-3.1-8b-Instruct", +response = client.inference.completion( + model_id="meta-llama/Llama-3.1-8B-Instruct", content="Complete the sentence using one word: Roses are red, violets are :", stream=False, sampling_params={ @@ -56,8 +56,8 @@ print(f"Response: {response.content}") ### Create Chat Completion ```python -response = client.chat_completion( - model_id="meta-llama/Llama-3.1-8b-Instruct", +response = client.inference.chat_completion( + model_id="meta-llama/Llama-3.1-8B-Instruct", messages=[ { "role": "system", @@ -78,8 +78,10 @@ print(f"Response: {response.completion_message.content}") ### Create Embeddings ```python -response = client.embeddings( - model_id="meta-llama/Llama-3.1-8b-Instruct", contents=["foo", "bar", "baz"] +response = client.inference.embeddings( + model_id="nvidia/llama-3.2-nv-embedqa-1b-v2", + contents=["What is the capital of France?"], + task_type="query", ) print(f"Embeddings: {response.embeddings}") -``` +``` \ No newline at end of file diff --git a/llama_stack/providers/utils/telemetry/tracing.py b/llama_stack/providers/utils/telemetry/tracing.py index 75b29cdce..7080e774a 100644 --- a/llama_stack/providers/utils/telemetry/tracing.py +++ b/llama_stack/providers/utils/telemetry/tracing.py @@ -9,7 +9,9 @@ import contextvars import logging import queue import random +import sys import threading +import time from collections.abc import Callable from datetime import UTC, datetime from functools import wraps @@ -30,6 +32,16 @@ from llama_stack.providers.utils.telemetry.trace_protocol import serialize_value logger = get_logger(__name__, category="core") +# Fallback logger that does NOT propagate to TelemetryHandler to avoid recursion +_fallback_logger = logging.getLogger("llama_stack.telemetry.background") +if not _fallback_logger.handlers: + _fallback_logger.propagate = False + _fallback_logger.setLevel(logging.ERROR) + _fallback_handler = logging.StreamHandler(sys.stderr) + _fallback_handler.setLevel(logging.ERROR) + _fallback_handler.setFormatter(logging.Formatter("%(asctime)s [%(levelname)s] %(name)s: %(message)s")) + _fallback_logger.addHandler(_fallback_handler) + INVALID_SPAN_ID = 0x0000000000000000 INVALID_TRACE_ID = 0x00000000000000000000000000000000 @@ -79,19 +91,32 @@ def generate_trace_id() -> str: CURRENT_TRACE_CONTEXT = contextvars.ContextVar("trace_context", default=None) BACKGROUND_LOGGER = None +LOG_QUEUE_FULL_LOG_INTERVAL_SECONDS = 60.0 + class BackgroundLogger: def __init__(self, api: Telemetry, capacity: int = 100000): self.api = api - self.log_queue = queue.Queue(maxsize=capacity) + self.log_queue: queue.Queue[Any] = queue.Queue(maxsize=capacity) self.worker_thread = threading.Thread(target=self._process_logs, daemon=True) self.worker_thread.start() + self._last_queue_full_log_time: float = 0.0 + self._dropped_since_last_notice: int = 0 def log_event(self, event): try: self.log_queue.put_nowait(event) except queue.Full: - logger.error("Log queue is full, dropping event") + # Aggregate drops and emit at most once per interval via fallback logger + self._dropped_since_last_notice += 1 + current_time = time.time() + if current_time - self._last_queue_full_log_time >= LOG_QUEUE_FULL_LOG_INTERVAL_SECONDS: + _fallback_logger.error( + "Log queue is full; dropped %d events since last notice", + self._dropped_since_last_notice, + ) + self._last_queue_full_log_time = current_time + self._dropped_since_last_notice = 0 def _process_logs(self): while True: diff --git a/llama_stack/ui/app/logs/vector-stores/[id]/files/[fileId]/contents/[contentId]/page.tsx b/llama_stack/ui/app/logs/vector-stores/[id]/files/[fileId]/contents/[contentId]/page.tsx new file mode 100644 index 000000000..6896b992a --- /dev/null +++ b/llama_stack/ui/app/logs/vector-stores/[id]/files/[fileId]/contents/[contentId]/page.tsx @@ -0,0 +1,383 @@ +"use client"; + +import { useEffect, useState } from "react"; +import { useParams, useRouter } from "next/navigation"; +import { useAuthClient } from "@/hooks/use-auth-client"; +import { ContentsAPI, VectorStoreContentItem } from "@/lib/contents-api"; +import type { VectorStore } from "llama-stack-client/resources/vector-stores/vector-stores"; +import type { VectorStoreFile } from "llama-stack-client/resources/vector-stores/files"; +import { Card, CardContent, CardHeader, CardTitle } from "@/components/ui/card"; +import { Button } from "@/components/ui/button"; +import { Input } from "@/components/ui/input"; +import { Edit, Save, X, Trash2 } from "lucide-react"; +import { + DetailLoadingView, + DetailErrorView, + DetailNotFoundView, + DetailLayout, + PropertiesCard, + PropertyItem, +} from "@/components/layout/detail-layout"; +import { PageBreadcrumb, BreadcrumbSegment } from "@/components/layout/page-breadcrumb"; + +export default function ContentDetailPage() { + const params = useParams(); + const router = useRouter(); + const vectorStoreId = params.id as string; + const fileId = params.fileId as string; + const contentId = params.contentId as string; + const client = useAuthClient(); + + const getTextFromContent = (content: any): string => { + if (typeof content === 'string') { + return content; + } else if (content && content.type === 'text') { + return content.text; + } + return ''; + }; + + const [store, setStore] = useState(null); + const [file, setFile] = useState(null); + const [content, setContent] = useState(null); + const [isLoading, setIsLoading] = useState(true); + const [error, setError] = useState(null); + const [isEditing, setIsEditing] = useState(false); + const [editedContent, setEditedContent] = useState(""); + const [editedMetadata, setEditedMetadata] = useState>({}); + const [isEditingEmbedding, setIsEditingEmbedding] = useState(false); + const [editedEmbedding, setEditedEmbedding] = useState([]); + + useEffect(() => { + if (!vectorStoreId || !fileId || !contentId) return; + + const fetchData = async () => { + setIsLoading(true); + setError(null); + try { + const [storeResponse, fileResponse] = await Promise.all([ + client.vectorStores.retrieve(vectorStoreId), + client.vectorStores.files.retrieve(vectorStoreId, fileId), + ]); + + setStore(storeResponse as VectorStore); + setFile(fileResponse as VectorStoreFile); + + const contentsAPI = new ContentsAPI(client); + const contentsResponse = await contentsAPI.listContents(vectorStoreId, fileId); + const targetContent = contentsResponse.data.find(c => c.id === contentId); + + if (targetContent) { + setContent(targetContent); + setEditedContent(getTextFromContent(targetContent.content)); + setEditedMetadata({ ...targetContent.metadata }); + setEditedEmbedding(targetContent.embedding || []); + } else { + throw new Error(`Content ${contentId} not found`); + } + } catch (err) { + setError(err instanceof Error ? err : new Error("Failed to load content.")); + } finally { + setIsLoading(false); + } + }; + fetchData(); + }, [vectorStoreId, fileId, contentId, client]); + + const handleSave = async () => { + if (!content) return; + + try { + const updates: { content?: string; metadata?: Record } = {}; + + if (editedContent !== getTextFromContent(content.content)) { + updates.content = editedContent; + } + + if (JSON.stringify(editedMetadata) !== JSON.stringify(content.metadata)) { + updates.metadata = editedMetadata; + } + + if (Object.keys(updates).length > 0) { + const contentsAPI = new ContentsAPI(client); + const updatedContent = await contentsAPI.updateContent(vectorStoreId, fileId, contentId, updates); + setContent(updatedContent); + } + + setIsEditing(false); + } catch (err) { + console.error('Failed to update content:', err); + } + }; + + const handleDelete = async () => { + if (!confirm('Are you sure you want to delete this content?')) return; + + try { + const contentsAPI = new ContentsAPI(client); + await contentsAPI.deleteContent(vectorStoreId, fileId, contentId); + router.push(`/logs/vector-stores/${vectorStoreId}/files/${fileId}/contents`); + } catch (err) { + console.error('Failed to delete content:', err); + } + }; + + const handleCancel = () => { + setEditedContent(content ? getTextFromContent(content.content) : ""); + setEditedMetadata({ ...content?.metadata }); + setEditedEmbedding(content?.embedding || []); + setIsEditing(false); + setIsEditingEmbedding(false); + }; + + const title = `Content: ${contentId}`; + + const breadcrumbSegments: BreadcrumbSegment[] = [ + { label: "Vector Stores", href: "/logs/vector-stores" }, + { label: store?.name || vectorStoreId, href: `/logs/vector-stores/${vectorStoreId}` }, + { label: "Files", href: `/logs/vector-stores/${vectorStoreId}` }, + { label: fileId, href: `/logs/vector-stores/${vectorStoreId}/files/${fileId}` }, + { label: "Contents", href: `/logs/vector-stores/${vectorStoreId}/files/${fileId}/contents` }, + { label: contentId }, + ]; + + if (error) { + return ; + } + if (isLoading) { + return ; + } + if (!content) { + return ; + } + + const mainContent = ( + <> + + + Content +
+ {isEditing ? ( + <> + + + + ) : ( + <> + + + + )} +
+
+ + {isEditing ? ( +