Merge branch 'main' into chroma

Bwook (Byoungwook) Kim 2025-08-11 10:06:35 +09:00 committed by GitHub
commit d460fd64b4
GPG key ID: B5690EEEBB952194
17 changed files with 1419 additions and 88 deletions

@@ -174,7 +174,9 @@ class FaissIndex(EmbeddingIndex):
k: int,
score_threshold: float,
) -> QueryChunksResponse:
raise NotImplementedError("Keyword search is not supported in FAISS")
raise NotImplementedError(
"Keyword search is not supported - underlying DB FAISS does not support this search mode"
)
async def query_hybrid(
self,
@@ -185,7 +187,9 @@ class FaissIndex(EmbeddingIndex):
reranker_type: str,
reranker_params: dict[str, Any] | None = None,
) -> QueryChunksResponse:
raise NotImplementedError("Hybrid search is not supported in FAISS")
raise NotImplementedError(
"Hybrid search is not supported - underlying DB FAISS does not support this search mode"
)
class FaissVectorIOAdapter(OpenAIVectorStoreMixin, VectorIO, VectorDBsProtocolPrivate):

@@ -45,6 +45,18 @@ That means you'll get fast and efficient vector retrieval.
- Lightweight and easy to use
- Fully integrated with Llama Stack
- GPU support
- **Vector search** - FAISS supports pure vector similarity search using embeddings
## Search Modes
**Supported:**
- **Vector Search** (`mode="vector"`): Performs vector similarity search using embeddings
**Not Supported:**
- **Keyword Search** (`mode="keyword"`): Not supported by FAISS
- **Hybrid Search** (`mode="hybrid"`): Not supported by FAISS
> **Note**: FAISS is designed as a pure vector similarity search library. See the [FAISS GitHub repository](https://github.com/facebookresearch/faiss) for more details about FAISS's core functionality.
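In practice this means only `mode="vector"` requests succeed against a FAISS-backed store; keyword or hybrid requests surface the `NotImplementedError` shown in the `FaissIndex` change above. Below is a minimal, illustrative sketch of a vector-mode query; the vector store and query text are placeholders.
```python
# Illustrative sketch: vector-mode search against a FAISS-backed vector store.
# The store is assumed to have been created with the FAISS provider; the
# query text and result count below are placeholders.
search_response = client.vector_stores.search(
    vector_store_id=vector_store.id,
    query="What is machine learning?",
    search_mode="vector",  # the only mode FAISS supports
    max_num_results=5,
)

# search_mode="keyword" or search_mode="hybrid" would raise the
# NotImplementedError from FaissIndex shown above.
```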
## Usage
@@ -535,6 +547,7 @@ That means you're not limited to storing vectors in memory or in a separate service.
- Easy to use
- Fully integrated with Llama Stack
- Supports all search modes: vector, keyword, and hybrid search (both inline and remote configurations)
## Usage
@@ -625,6 +638,92 @@ vector_io:
- **`client_pem_path`**: Path to the **client certificate** file (required for mTLS).
- **`client_key_path`**: Path to the **client private key** file (required for mTLS).
## Search Modes
Milvus supports three different search modes for both inline and remote configurations:
### Vector Search
Vector search uses semantic similarity to find the most relevant chunks based on embedding vectors. This is the default search mode and works well for finding conceptually similar content.
```python
# Vector search example
search_response = client.vector_stores.search(
vector_store_id=vector_store.id,
query="What is machine learning?",
search_mode="vector",
max_num_results=5,
)
```
### Keyword Search
Keyword search uses traditional text-based matching to find chunks containing specific terms or phrases. This is useful when you need exact term matches.
```python
# Keyword search example
search_response = client.vector_stores.search(
vector_store_id=vector_store.id,
query="Python programming language",
search_mode="keyword",
max_num_results=5,
)
```
### Hybrid Search
Hybrid search combines both vector and keyword search methods to provide more comprehensive results. It leverages the strengths of both semantic similarity and exact term matching.
#### Basic Hybrid Search
```python
# Basic hybrid search example (uses RRF ranker with default impact_factor=60.0)
search_response = client.vector_stores.search(
vector_store_id=vector_store.id,
query="neural networks in Python",
search_mode="hybrid",
max_num_results=5,
)
```
**Note**: The default `impact_factor` value of 60.0 was empirically determined to be optimal in the original RRF research paper: ["Reciprocal Rank Fusion outperforms Condorcet and individual Rank Learning Methods"](https://plg.uwaterloo.ca/~gvcormac/cormacksigir09-rrf.pdf) (Cormack et al., 2009).
#### Hybrid Search with RRF (Reciprocal Rank Fusion) Ranker
RRF combines rankings from vector and keyword search by using reciprocal ranks. The impact factor controls how much weight is given to higher-ranked results.
```python
# Hybrid search with custom RRF parameters
search_response = client.vector_stores.search(
vector_store_id=vector_store.id,
query="neural networks in Python",
search_mode="hybrid",
max_num_results=5,
ranking_options={
"ranker": {
"type": "rrf",
"impact_factor": 100.0, # Higher values give more weight to top-ranked results
}
},
)
```
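To make the fusion step concrete, here is a small, self-contained sketch of reciprocal rank fusion as described above. It is illustrative only and is not Milvus's or Llama Stack's internal implementation; the function name and sample document IDs are made up.
```python
# Illustrative RRF sketch (not the Milvus/Llama Stack implementation).
# Each document's fused score is the sum of 1 / (impact_factor + rank)
# over the vector-search and keyword-search rankings (ranks start at 1).
def rrf_fuse(
    vector_ranking: list[str],
    keyword_ranking: list[str],
    impact_factor: float = 60.0,
) -> list[str]:
    scores: dict[str, float] = {}
    for ranking in (vector_ranking, keyword_ranking):
        for rank, doc_id in enumerate(ranking, start=1):
            scores[doc_id] = scores.get(doc_id, 0.0) + 1.0 / (impact_factor + rank)
    return sorted(scores, key=scores.get, reverse=True)


# "doc-b" appears near the top of both rankings, so it leads the fused list.
print(rrf_fuse(["doc-a", "doc-b", "doc-c"], ["doc-b", "doc-d", "doc-a"]))
```
Here `impact_factor` plays the role of the constant k in the 1 / (k + rank) term from the RRF paper cited above.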
#### Hybrid Search with Weighted Ranker
Weighted ranker linearly combines normalized scores from vector and keyword search. The alpha parameter controls the balance between the two search methods.
```python
# Hybrid search with weighted ranker
search_response = client.vector_stores.search(
vector_store_id=vector_store.id,
query="neural networks in Python",
search_mode="hybrid",
max_num_results=5,
ranking_options={
"ranker": {
"type": "weighted",
"alpha": 0.7, # 70% vector search, 30% keyword search
}
},
)
```
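For comparison, the sketch below illustrates the linear combination described above, with `alpha` weighting the vector-search score and `1 - alpha` the keyword-search score. It assumes per-method scores are already normalized to [0, 1] and is not Milvus's exact normalization or implementation; the function name and sample scores are made up.
```python
# Illustrative weighted-fusion sketch (not the Milvus/Llama Stack implementation).
# Assumes per-method scores are already normalized to [0, 1].
def weighted_fuse(
    vector_scores: dict[str, float],
    keyword_scores: dict[str, float],
    alpha: float = 0.7,  # 70% vector search, 30% keyword search
) -> list[str]:
    fused: dict[str, float] = {}
    for doc_id in set(vector_scores) | set(keyword_scores):
        fused[doc_id] = alpha * vector_scores.get(doc_id, 0.0) + (1 - alpha) * keyword_scores.get(doc_id, 0.0)
    return sorted(fused, key=fused.get, reverse=True)


# With alpha=0.7 the semantic hit "doc-a" leads, but the strong keyword
# match for "doc-b" still lifts it above "doc-c".
print(weighted_fuse({"doc-a": 0.9, "doc-b": 0.4}, {"doc-b": 1.0, "doc-c": 0.8}))
```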
For detailed documentation on RRF and Weighted rankers, please refer to the [Milvus Reranking Guide](https://milvus.io/docs/reranking.md).
## Documentation
See the [Milvus documentation](https://milvus.io/docs/install-overview.md) for more details about Milvus in general.

@@ -13,7 +13,9 @@ LLM_MODEL_IDS = [
"gemini-1.5-flash",
"gemini-1.5-pro",
"gemini-2.0-flash",
"gemini-2.0-flash-lite",
"gemini-2.5-flash",
"gemini-2.5-flash-lite",
"gemini-2.5-pro",
]

@@ -42,8 +42,8 @@ client.initialize()
### Create Completion
```python
response = client.completion(
model_id="meta-llama/Llama-3.1-8b-Instruct",
response = client.inference.completion(
model_id="meta-llama/Llama-3.1-8B-Instruct",
content="Complete the sentence using one word: Roses are red, violets are :",
stream=False,
sampling_params={
@@ -56,8 +56,8 @@ print(f"Response: {response.content}")
### Create Chat Completion
```python
response = client.chat_completion(
model_id="meta-llama/Llama-3.1-8b-Instruct",
response = client.inference.chat_completion(
model_id="meta-llama/Llama-3.1-8B-Instruct",
messages=[
{
"role": "system",
@@ -78,8 +78,10 @@ print(f"Response: {response.completion_message.content}")
### Create Embeddings
```python
response = client.embeddings(
model_id="meta-llama/Llama-3.1-8b-Instruct", contents=["foo", "bar", "baz"]
response = client.inference.embeddings(
model_id="nvidia/llama-3.2-nv-embedqa-1b-v2",
contents=["What is the capital of France?"],
task_type="query",
)
print(f"Embeddings: {response.embeddings}")
```
```

@@ -9,7 +9,9 @@ import contextvars
import logging
import queue
import random
import sys
import threading
import time
from collections.abc import Callable
from datetime import UTC, datetime
from functools import wraps
@@ -30,6 +32,16 @@ from llama_stack.providers.utils.telemetry.trace_protocol import serialize_value
logger = get_logger(__name__, category="core")
# Fallback logger that does NOT propagate to TelemetryHandler to avoid recursion
_fallback_logger = logging.getLogger("llama_stack.telemetry.background")
if not _fallback_logger.handlers:
_fallback_logger.propagate = False
_fallback_logger.setLevel(logging.ERROR)
_fallback_handler = logging.StreamHandler(sys.stderr)
_fallback_handler.setLevel(logging.ERROR)
_fallback_handler.setFormatter(logging.Formatter("%(asctime)s [%(levelname)s] %(name)s: %(message)s"))
_fallback_logger.addHandler(_fallback_handler)
INVALID_SPAN_ID = 0x0000000000000000
INVALID_TRACE_ID = 0x00000000000000000000000000000000
@@ -79,19 +91,32 @@ def generate_trace_id() -> str:
CURRENT_TRACE_CONTEXT = contextvars.ContextVar("trace_context", default=None)
BACKGROUND_LOGGER = None
LOG_QUEUE_FULL_LOG_INTERVAL_SECONDS = 60.0
class BackgroundLogger:
def __init__(self, api: Telemetry, capacity: int = 100000):
self.api = api
self.log_queue = queue.Queue(maxsize=capacity)
self.log_queue: queue.Queue[Any] = queue.Queue(maxsize=capacity)
self.worker_thread = threading.Thread(target=self._process_logs, daemon=True)
self.worker_thread.start()
self._last_queue_full_log_time: float = 0.0
self._dropped_since_last_notice: int = 0
def log_event(self, event):
try:
self.log_queue.put_nowait(event)
except queue.Full:
logger.error("Log queue is full, dropping event")
# Aggregate drops and emit at most once per interval via fallback logger
self._dropped_since_last_notice += 1
current_time = time.time()
if current_time - self._last_queue_full_log_time >= LOG_QUEUE_FULL_LOG_INTERVAL_SECONDS:
_fallback_logger.error(
"Log queue is full; dropped %d events since last notice",
self._dropped_since_last_notice,
)
self._last_queue_full_log_time = current_time
self._dropped_since_last_notice = 0
def _process_logs(self):
while True: