Merge branch 'main' into vector-store-chunks

2025-10-09 13:14:39 +00:00 · 2025-08-03 12:55:14 -04:00 · 2025-08-03 12:55:14 -04:00 · 4e986e9caf
commit 4e986e9caf
parent 24865ea42d dbfc15123e
51 changed files with 2180 additions and 2294 deletions
--- a/llama_stack/providers/inline/vector_io/faiss/faiss.py
+++ b/llama_stack/providers/inline/vector_io/faiss/faiss.py
@@ -160,8 +160,11 @@ class FaissIndex(EmbeddingIndex):
        for d, i in zip(distances[0], indices[0], strict=False):
            if i < 0:
                continue
+            score = 1.0 / float(d) if d != 0 else float("inf")
+            if score < score_threshold:
+                continue
            chunks.append(self.chunk_by_index[int(i)])
-            scores.append(1.0 / float(d) if d != 0 else float("inf"))
+            scores.append(score)

        return QueryChunksResponse(chunks=chunks, scores=scores)

--- a/llama_stack/providers/remote/datasetio/nvidia/README.md
+++ b/llama_stack/providers/remote/datasetio/nvidia/README.md
@@ -20,7 +20,7 @@ This provider enables dataset management using NVIDIA's NeMo Customizer service.
 Build the NVIDIA environment:

 ```bash
-llama stack build --template nvidia --image-type conda
+llama stack build --template nvidia --image-type venv
 ```

 ### Basic Usage using the LlamaStack Python Client
--- a/llama_stack/providers/remote/inference/nvidia/NVIDIA.md
+++ b/llama_stack/providers/remote/inference/nvidia/NVIDIA.md
@@ -18,7 +18,7 @@ This provider enables running inference using NVIDIA NIM.
 Build the NVIDIA environment:

 ```bash
-llama stack build --template nvidia --image-type conda
+llama stack build --template nvidia --image-type venv
 ```

 ### Basic Usage using the LlamaStack Python Client
--- a/llama_stack/providers/remote/post_training/nvidia/README.md
+++ b/llama_stack/providers/remote/post_training/nvidia/README.md
@@ -22,7 +22,7 @@ This provider enables fine-tuning of LLMs using NVIDIA's NeMo Customizer service
 Build the NVIDIA environment:

 ```bash
-llama stack build --template nvidia --image-type conda
+llama stack build --template nvidia --image-type venv
 ```

 ### Basic Usage using the LlamaStack Python Client
--- a/llama_stack/providers/remote/safety/nvidia/README.md
+++ b/llama_stack/providers/remote/safety/nvidia/README.md
@@ -19,7 +19,7 @@ This provider enables safety checks and guardrails for LLM interactions using NV
 Build the NVIDIA environment:

 ```bash
-llama stack build --template nvidia --image-type conda
+llama stack build --template nvidia --image-type venv
 ```

 ### Basic Usage using the LlamaStack Python Client
--- a/llama_stack/providers/remote/vector_io/pgvector/pgvector.py
+++ b/llama_stack/providers/remote/vector_io/pgvector/pgvector.py
@@ -132,8 +132,11 @@ class PGVectorIndex(EmbeddingIndex):
            chunks = []
            scores = []
            for doc, dist in results:
+                score = 1.0 / float(dist) if dist != 0 else float("inf")
+                if score < score_threshold:
+                    continue
                chunks.append(Chunk(**doc))
-                scores.append(1.0 / float(dist) if dist != 0 else float("inf"))
+                scores.append(score)

            return QueryChunksResponse(chunks=chunks, scores=scores)

--- a/llama_stack/providers/remote/vector_io/weaviate/weaviate.py
+++ b/llama_stack/providers/remote/vector_io/weaviate/weaviate.py
@@ -105,8 +105,12 @@ class WeaviateIndex(EmbeddingIndex):
                log.exception(f"Failed to parse document: {chunk_json}")
                continue

+            score = 1.0 / doc.metadata.distance if doc.metadata.distance != 0 else float("inf")
+            if score < score_threshold:
+                continue
+
            chunks.append(chunk)
-            scores.append(1.0 / doc.metadata.distance if doc.metadata.distance != 0 else float("inf"))
+            scores.append(score)

        return QueryChunksResponse(chunks=chunks, scores=scores)

--- a/llama_stack/providers/utils/memory/openai_vector_store_mixin.py
+++ b/llama_stack/providers/utils/memory/openai_vector_store_mixin.py
@@ -444,10 +444,6 @@ class OpenAIVectorStoreMixin(ABC):
            # Convert response to OpenAI format
            data = []
            for chunk, score in zip(response.chunks, response.scores, strict=False):
-                # Apply score based filtering
-                if score < score_threshold:
-                    continue
-
                # Apply filters if provided
                if filters:
                    # Simple metadata filtering