chore(package): migrate to src/ layout (#3920)

Migrates package structure to src/ layout following Python packaging
best practices.

All code moved from `llama_stack/` to `src/llama_stack/`. Public API
unchanged - imports remain `import llama_stack.*`.

Updated build configs, pre-commit hooks, scripts, and GitHub workflows
accordingly. All hooks pass, package builds cleanly.

**Developer note**: after pulling, reinstall with `pip install -e .`.
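A quick sanity check after reinstalling (a hypothetical snippet, not part of the repo): imports are unchanged, but an editable install should now resolve the package from the new `src/` tree.

```python
# Hypothetical check, not repo code: after `pip install -e .`, the import path
# stays `llama_stack`, but the module file should live under src/llama_stack/.
import llama_stack

print(llama_stack.__file__)  # expected to point at .../src/llama_stack/__init__.py
```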
Author: Ashwin Bharambe (committed by GitHub), 2025-10-27 12:02:21 -07:00
Parent: 98a5047f9d
Commit: 471b1b248b
791 changed files with 2983 additions and 456 deletions


@@ -1,5 +0,0 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.


@@ -1,17 +0,0 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from llama_stack.providers.datatypes import Api, ProviderSpec
from .config import ChromaVectorIOConfig
async def get_adapter_impl(config: ChromaVectorIOConfig, deps: dict[Api, ProviderSpec]):
from .chroma import ChromaVectorIOAdapter
impl = ChromaVectorIOAdapter(config, deps[Api.inference], deps.get(Api.files))
await impl.initialize()
return impl
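Each remote vector_io provider package here exposes the same entry-point shape: `get_adapter_impl` lazily imports the adapter (so optional client libraries such as `chromadb` are only needed when the provider is actually used), constructs it, and awaits `initialize()`. A minimal self-contained sketch of that construct-then-initialize flow, using stand-in names rather than llama_stack APIs:

```python
# Sketch of the construct-then-initialize provider pattern; _StubAdapter,
# get_impl, and the literal config/deps values are illustrative stand-ins.
import asyncio


class _StubAdapter:
    def __init__(self, config: dict, inference, files=None):
        self.config, self.inference, self.files = config, inference, files
        self.ready = False

    async def initialize(self) -> None:
        # Real adapters open client connections / load persisted state here.
        self.ready = True


async def get_impl(config: dict, deps: dict):
    impl = _StubAdapter(config, deps["inference"], deps.get("files"))
    await impl.initialize()
    return impl


adapter = asyncio.run(get_impl({"url": "http://localhost:8000"}, {"inference": object()}))
assert adapter.ready
```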


@@ -1,209 +0,0 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import asyncio
import json
from typing import Any
from urllib.parse import urlparse
import chromadb
from numpy.typing import NDArray
from llama_stack.apis.files import Files
from llama_stack.apis.inference import Inference, InterleavedContent
from llama_stack.apis.vector_io import Chunk, QueryChunksResponse, VectorIO
from llama_stack.apis.vector_stores import VectorStore
from llama_stack.log import get_logger
from llama_stack.providers.datatypes import VectorStoresProtocolPrivate
from llama_stack.providers.inline.vector_io.chroma import ChromaVectorIOConfig as InlineChromaVectorIOConfig
from llama_stack.providers.utils.kvstore import kvstore_impl
from llama_stack.providers.utils.kvstore.api import KVStore
from llama_stack.providers.utils.memory.openai_vector_store_mixin import OpenAIVectorStoreMixin
from llama_stack.providers.utils.memory.vector_store import ChunkForDeletion, EmbeddingIndex, VectorStoreWithIndex
from .config import ChromaVectorIOConfig as RemoteChromaVectorIOConfig
log = get_logger(name=__name__, category="vector_io::chroma")
ChromaClientType = chromadb.api.AsyncClientAPI | chromadb.api.ClientAPI
VERSION = "v3"
VECTOR_DBS_PREFIX = f"vector_stores:chroma:{VERSION}::"
VECTOR_INDEX_PREFIX = f"vector_index:chroma:{VERSION}::"
OPENAI_VECTOR_STORES_PREFIX = f"openai_vector_stores:chroma:{VERSION}::"
OPENAI_VECTOR_STORES_FILES_PREFIX = f"openai_vector_stores_files:chroma:{VERSION}::"
OPENAI_VECTOR_STORES_FILES_CONTENTS_PREFIX = f"openai_vector_stores_files_contents:chroma:{VERSION}::"
# this is a helper to allow us to use async and non-async chroma clients interchangeably
async def maybe_await(result):
if asyncio.iscoroutine(result):
return await result
return result
class ChromaIndex(EmbeddingIndex):
def __init__(self, client: ChromaClientType, collection, kvstore: KVStore | None = None):
self.client = client
self.collection = collection
self.kvstore = kvstore
async def initialize(self):
pass
async def add_chunks(self, chunks: list[Chunk], embeddings: NDArray):
assert len(chunks) == len(embeddings), (
f"Chunk length {len(chunks)} does not match embedding length {len(embeddings)}"
)
ids = [f"{c.metadata.get('document_id', '')}:{c.chunk_id}" for c in chunks]
await maybe_await(
self.collection.add(documents=[chunk.model_dump_json() for chunk in chunks], embeddings=embeddings, ids=ids)
)
async def query_vector(self, embedding: NDArray, k: int, score_threshold: float) -> QueryChunksResponse:
results = await maybe_await(
self.collection.query(
query_embeddings=[embedding.tolist()], n_results=k, include=["documents", "distances"]
)
)
distances = results["distances"][0]
documents = results["documents"][0]
chunks = []
scores = []
for dist, doc in zip(distances, documents, strict=False):
try:
doc = json.loads(doc)
chunk = Chunk(**doc)
except Exception:
log.exception(f"Failed to parse document: {doc}")
continue
score = 1.0 / float(dist) if dist != 0 else float("inf")
if score < score_threshold:
continue
chunks.append(chunk)
scores.append(score)
return QueryChunksResponse(chunks=chunks, scores=scores)
async def delete(self):
await maybe_await(self.client.delete_collection(self.collection.name))
async def query_keyword(self, query_string: str, k: int, score_threshold: float) -> QueryChunksResponse:
raise NotImplementedError("Keyword search is not supported in Chroma")
async def delete_chunks(self, chunks_for_deletion: list[ChunkForDeletion]) -> None:
"""Delete chunks from the Chroma collection by their IDs."""
ids = [f"{chunk.document_id}:{chunk.chunk_id}" for chunk in chunks_for_deletion]
await maybe_await(self.collection.delete(ids=ids))
async def query_hybrid(
self,
embedding: NDArray,
query_string: str,
k: int,
score_threshold: float,
reranker_type: str,
reranker_params: dict[str, Any] | None = None,
) -> QueryChunksResponse:
raise NotImplementedError("Hybrid search is not supported in Chroma")
class ChromaVectorIOAdapter(OpenAIVectorStoreMixin, VectorIO, VectorStoresProtocolPrivate):
def __init__(
self,
config: RemoteChromaVectorIOConfig | InlineChromaVectorIOConfig,
inference_api: Inference,
files_api: Files | None,
) -> None:
super().__init__(files_api=files_api, kvstore=None)
log.info(f"Initializing ChromaVectorIOAdapter with config: {config}")
self.config = config
self.inference_api = inference_api
self.client = None
self.cache = {}
self.vector_store_table = None
async def initialize(self) -> None:
self.kvstore = await kvstore_impl(self.config.persistence)
self.vector_store_table = self.kvstore
if isinstance(self.config, RemoteChromaVectorIOConfig):
log.info(f"Connecting to Chroma server at: {self.config.url}")
url = self.config.url.rstrip("/")
parsed = urlparse(url)
if parsed.path and parsed.path != "/":
raise ValueError("URL should not contain a path")
self.client = await chromadb.AsyncHttpClient(host=parsed.hostname, port=parsed.port)
else:
log.info(f"Connecting to Chroma local db at: {self.config.db_path}")
self.client = chromadb.PersistentClient(path=self.config.db_path)
self.openai_vector_stores = await self._load_openai_vector_stores()
async def shutdown(self) -> None:
# Clean up mixin resources (file batch tasks)
await super().shutdown()
async def register_vector_store(self, vector_store: VectorStore) -> None:
collection = await maybe_await(
self.client.get_or_create_collection(
name=vector_store.identifier, metadata={"vector_store": vector_store.model_dump_json()}
)
)
self.cache[vector_store.identifier] = VectorStoreWithIndex(
vector_store, ChromaIndex(self.client, collection), self.inference_api
)
async def unregister_vector_store(self, vector_store_id: str) -> None:
if vector_store_id not in self.cache:
log.warning(f"Vector DB {vector_store_id} not found")
return
await self.cache[vector_store_id].index.delete()
del self.cache[vector_store_id]
async def insert_chunks(self, vector_db_id: str, chunks: list[Chunk], ttl_seconds: int | None = None) -> None:
index = await self._get_and_cache_vector_store_index(vector_db_id)
if index is None:
raise ValueError(f"Vector DB {vector_db_id} not found in Chroma")
await index.insert_chunks(chunks)
async def query_chunks(
self, vector_db_id: str, query: InterleavedContent, params: dict[str, Any] | None = None
) -> QueryChunksResponse:
index = await self._get_and_cache_vector_store_index(vector_db_id)
if index is None:
raise ValueError(f"Vector DB {vector_db_id} not found in Chroma")
return await index.query_chunks(query, params)
async def _get_and_cache_vector_store_index(self, vector_store_id: str) -> VectorStoreWithIndex:
if vector_store_id in self.cache:
return self.cache[vector_store_id]
vector_store = await self.vector_store_table.get_vector_store(vector_store_id)
if not vector_store:
raise ValueError(f"Vector DB {vector_store_id} not found in Llama Stack")
collection = await maybe_await(self.client.get_collection(vector_store_id))
if not collection:
raise ValueError(f"Vector DB {vector_store_id} not found in Chroma")
index = VectorStoreWithIndex(vector_store, ChromaIndex(self.client, collection), self.inference_api)
self.cache[vector_store_id] = index
return index
async def delete_chunks(self, store_id: str, chunks_for_deletion: list[ChunkForDeletion]) -> None:
"""Delete chunks from a Chroma vector store."""
index = await self._get_and_cache_vector_store_index(store_id)
if not index:
raise ValueError(f"Vector DB {store_id} not found")
await index.index.delete_chunks(chunks_for_deletion)
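The `maybe_await` helper above is what lets the adapter drive Chroma's synchronous `PersistentClient` and asynchronous `AsyncHttpClient` through the same call sites. A self-contained sketch of the pattern, with stand-in functions instead of chromadb calls:

```python
# Self-contained sketch of the maybe_await pattern; sync_count/async_count are
# stand-ins for sync vs. async Chroma client methods, not chromadb APIs.
import asyncio


async def maybe_await(result):
    if asyncio.iscoroutine(result):
        return await result
    return result


def sync_count() -> int:
    return 3


async def async_count() -> int:
    return 3


async def main() -> None:
    assert await maybe_await(sync_count()) == 3
    assert await maybe_await(async_count()) == 3


asyncio.run(main())
```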


@@ -1,28 +0,0 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from typing import Any
from pydantic import BaseModel, Field
from llama_stack.core.storage.datatypes import KVStoreReference
from llama_stack.schema_utils import json_schema_type
@json_schema_type
class ChromaVectorIOConfig(BaseModel):
url: str | None
persistence: KVStoreReference = Field(description="Config for KV store backend")
@classmethod
def sample_run_config(cls, __distro_dir__: str, url: str = "${env.CHROMADB_URL}", **kwargs: Any) -> dict[str, Any]:
return {
"url": url,
"persistence": KVStoreReference(
backend="kv_default",
namespace="vector_io::chroma_remote",
).model_dump(exclude_none=True),
}


@@ -1,18 +0,0 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from llama_stack.providers.datatypes import Api, ProviderSpec
from .config import MilvusVectorIOConfig
async def get_adapter_impl(config: MilvusVectorIOConfig, deps: dict[Api, ProviderSpec]):
from .milvus import MilvusVectorIOAdapter
assert isinstance(config, MilvusVectorIOConfig), f"Unexpected config type: {type(config)}"
impl = MilvusVectorIOAdapter(config, deps[Api.inference], deps.get(Api.files))
await impl.initialize()
return impl


@@ -1,35 +0,0 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from typing import Any
from pydantic import BaseModel, ConfigDict, Field
from llama_stack.core.storage.datatypes import KVStoreReference
from llama_stack.schema_utils import json_schema_type
@json_schema_type
class MilvusVectorIOConfig(BaseModel):
uri: str = Field(description="The URI of the Milvus server")
token: str | None = Field(description="The token of the Milvus server")
consistency_level: str = Field(description="The consistency level of the Milvus server", default="Strong")
persistence: KVStoreReference = Field(description="Config for KV store backend")
# This configuration allows additional fields to be passed through to the underlying Milvus client.
# See the [Milvus](https://milvus.io/docs/install-overview.md) documentation for more details about Milvus in general.
model_config = ConfigDict(extra="allow")
@classmethod
def sample_run_config(cls, __distro_dir__: str, **kwargs: Any) -> dict[str, Any]:
return {
"uri": "${env.MILVUS_ENDPOINT}",
"token": "${env.MILVUS_TOKEN}",
"persistence": KVStoreReference(
backend="kv_default",
namespace="vector_io::milvus_remote",
).model_dump(exclude_none=True),
}
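Because `MilvusVectorIOConfig` declares `model_config = ConfigDict(extra="allow")`, additional client options can ride along on the config and be forwarded to `MilvusClient` via `model_dump`. A hedged sketch of that pass-through; the import path and the `db_name` option are illustrative assumptions, not verified repo facts:

```python
# Hedged sketch: extra fields pass through thanks to ConfigDict(extra="allow").
# The module path and the db_name option are assumptions for illustration.
from llama_stack.core.storage.datatypes import KVStoreReference
from llama_stack.providers.remote.vector_io.milvus.config import MilvusVectorIOConfig

cfg = MilvusVectorIOConfig(
    uri="http://localhost:19530",
    token=None,
    persistence=KVStoreReference(backend="kv_default", namespace="vector_io::milvus_remote"),
    db_name="default",  # extra key, accepted because extra="allow"; forwarded to MilvusClient
)
print(cfg.model_dump(exclude_none=True))
```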


@@ -1,372 +0,0 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import asyncio
import os
from typing import Any
from numpy.typing import NDArray
from pymilvus import AnnSearchRequest, DataType, Function, FunctionType, MilvusClient, RRFRanker, WeightedRanker
from llama_stack.apis.common.errors import VectorStoreNotFoundError
from llama_stack.apis.files import Files
from llama_stack.apis.inference import Inference, InterleavedContent
from llama_stack.apis.vector_io import Chunk, QueryChunksResponse, VectorIO
from llama_stack.apis.vector_stores import VectorStore
from llama_stack.log import get_logger
from llama_stack.providers.datatypes import VectorStoresProtocolPrivate
from llama_stack.providers.inline.vector_io.milvus import MilvusVectorIOConfig as InlineMilvusVectorIOConfig
from llama_stack.providers.utils.kvstore import kvstore_impl
from llama_stack.providers.utils.kvstore.api import KVStore
from llama_stack.providers.utils.memory.openai_vector_store_mixin import OpenAIVectorStoreMixin
from llama_stack.providers.utils.memory.vector_store import (
RERANKER_TYPE_WEIGHTED,
ChunkForDeletion,
EmbeddingIndex,
VectorStoreWithIndex,
)
from llama_stack.providers.utils.vector_io.vector_utils import sanitize_collection_name
from .config import MilvusVectorIOConfig as RemoteMilvusVectorIOConfig
logger = get_logger(name=__name__, category="vector_io::milvus")
VERSION = "v3"
VECTOR_DBS_PREFIX = f"vector_stores:milvus:{VERSION}::"
VECTOR_INDEX_PREFIX = f"vector_index:milvus:{VERSION}::"
OPENAI_VECTOR_STORES_PREFIX = f"openai_vector_stores:milvus:{VERSION}::"
OPENAI_VECTOR_STORES_FILES_PREFIX = f"openai_vector_stores_files:milvus:{VERSION}::"
OPENAI_VECTOR_STORES_FILES_CONTENTS_PREFIX = f"openai_vector_stores_files_contents:milvus:{VERSION}::"
class MilvusIndex(EmbeddingIndex):
def __init__(
self, client: MilvusClient, collection_name: str, consistency_level="Strong", kvstore: KVStore | None = None
):
self.client = client
self.collection_name = sanitize_collection_name(collection_name)
self.consistency_level = consistency_level
self.kvstore = kvstore
async def initialize(self):
# MilvusIndex does not require explicit initialization
# TODO: could move collection creation into initialization but it is not really necessary
pass
async def delete(self):
if await asyncio.to_thread(self.client.has_collection, self.collection_name):
await asyncio.to_thread(self.client.drop_collection, collection_name=self.collection_name)
async def add_chunks(self, chunks: list[Chunk], embeddings: NDArray):
assert len(chunks) == len(embeddings), (
f"Chunk length {len(chunks)} does not match embedding length {len(embeddings)}"
)
if not await asyncio.to_thread(self.client.has_collection, self.collection_name):
logger.info(f"Creating new collection {self.collection_name} with nullable sparse field")
# Create schema for vector search
schema = self.client.create_schema()
schema.add_field(field_name="chunk_id", datatype=DataType.VARCHAR, is_primary=True, max_length=100)
schema.add_field(
field_name="content",
datatype=DataType.VARCHAR,
max_length=65535,
enable_analyzer=True, # Enable text analysis for BM25
)
schema.add_field(field_name="vector", datatype=DataType.FLOAT_VECTOR, dim=len(embeddings[0]))
schema.add_field(field_name="chunk_content", datatype=DataType.JSON)
# Add sparse vector field for BM25 (required by the function)
schema.add_field(field_name="sparse", datatype=DataType.SPARSE_FLOAT_VECTOR)
# Create indexes
index_params = self.client.prepare_index_params()
index_params.add_index(field_name="vector", index_type="FLAT", metric_type="COSINE")
# Add index for sparse field (required by BM25 function)
index_params.add_index(field_name="sparse", index_type="SPARSE_INVERTED_INDEX", metric_type="BM25")
# Add BM25 function for full-text search
bm25_function = Function(
name="text_bm25_emb",
input_field_names=["content"],
output_field_names=["sparse"],
function_type=FunctionType.BM25,
)
schema.add_function(bm25_function)
await asyncio.to_thread(
self.client.create_collection,
self.collection_name,
schema=schema,
index_params=index_params,
consistency_level=self.consistency_level,
)
data = []
for chunk, embedding in zip(chunks, embeddings, strict=False):
data.append(
{
"chunk_id": chunk.chunk_id,
"content": chunk.content,
"vector": embedding,
"chunk_content": chunk.model_dump(),
# sparse field will be handled by BM25 function automatically
}
)
try:
await asyncio.to_thread(self.client.insert, self.collection_name, data=data)
except Exception as e:
logger.error(f"Error inserting chunks into Milvus collection {self.collection_name}: {e}")
raise e
async def query_vector(self, embedding: NDArray, k: int, score_threshold: float) -> QueryChunksResponse:
search_res = await asyncio.to_thread(
self.client.search,
collection_name=self.collection_name,
data=[embedding],
anns_field="vector",
limit=k,
output_fields=["*"],
search_params={"params": {"radius": score_threshold}},
)
chunks = [Chunk(**res["entity"]["chunk_content"]) for res in search_res[0]]
scores = [res["distance"] for res in search_res[0]]
return QueryChunksResponse(chunks=chunks, scores=scores)
async def query_keyword(self, query_string: str, k: int, score_threshold: float) -> QueryChunksResponse:
"""
Perform BM25-based keyword search using Milvus's built-in full-text search.
"""
try:
# Use Milvus's built-in BM25 search
search_res = await asyncio.to_thread(
self.client.search,
collection_name=self.collection_name,
data=[query_string], # Raw text query
anns_field="sparse", # Use sparse field for BM25
output_fields=["chunk_content"], # Output the chunk content
limit=k,
search_params={
"params": {
"drop_ratio_search": 0.2, # Ignore low-importance terms
}
},
)
chunks = []
scores = []
for res in search_res[0]:
chunk = Chunk(**res["entity"]["chunk_content"])
chunks.append(chunk)
scores.append(res["distance"]) # BM25 score from Milvus
# Filter by score threshold
filtered_chunks = [chunk for chunk, score in zip(chunks, scores, strict=False) if score >= score_threshold]
filtered_scores = [score for score in scores if score >= score_threshold]
return QueryChunksResponse(chunks=filtered_chunks, scores=filtered_scores)
except Exception as e:
logger.error(f"Error performing BM25 search: {e}")
# Fallback to simple text search
return await self._fallback_keyword_search(query_string, k, score_threshold)
async def _fallback_keyword_search(self, query_string: str, k: int, score_threshold: float) -> QueryChunksResponse:
"""
Fallback to simple text search when BM25 search is not available.
"""
# Simple text search using content field
search_res = await asyncio.to_thread(
self.client.query,
collection_name=self.collection_name,
filter='content like "%{content}%"',
filter_params={"content": query_string},
output_fields=["*"],
limit=k,
)
chunks = [Chunk(**res["chunk_content"]) for res in search_res]
scores = [1.0] * len(chunks) # Simple binary score for text search
return QueryChunksResponse(chunks=chunks, scores=scores)
async def query_hybrid(
self,
embedding: NDArray,
query_string: str,
k: int,
score_threshold: float,
reranker_type: str,
reranker_params: dict[str, Any] | None = None,
) -> QueryChunksResponse:
"""
Hybrid search using Milvus's native hybrid search capabilities.
This implementation uses Milvus's hybrid_search method which combines
vector search and BM25 search with configurable reranking strategies.
"""
search_requests = []
# nprobe: Controls search accuracy vs performance trade-off
# 10 balances these trade-offs for RAG applications
search_requests.append(
AnnSearchRequest(data=[embedding.tolist()], anns_field="vector", param={"nprobe": 10}, limit=k)
)
# drop_ratio_search: Filters low-importance terms to improve search performance
# 0.2 balances noise reduction with recall
search_requests.append(
AnnSearchRequest(data=[query_string], anns_field="sparse", param={"drop_ratio_search": 0.2}, limit=k)
)
if reranker_type == RERANKER_TYPE_WEIGHTED:
alpha = (reranker_params or {}).get("alpha", 0.5)
rerank = WeightedRanker(alpha, 1 - alpha)
else:
impact_factor = (reranker_params or {}).get("impact_factor", 60.0)
rerank = RRFRanker(impact_factor)
search_res = await asyncio.to_thread(
self.client.hybrid_search,
collection_name=self.collection_name,
reqs=search_requests,
ranker=rerank,
limit=k,
output_fields=["chunk_content"],
)
chunks = []
scores = []
for res in search_res[0]:
chunk = Chunk(**res["entity"]["chunk_content"])
chunks.append(chunk)
scores.append(res["distance"])
filtered_chunks = [chunk for chunk, score in zip(chunks, scores, strict=False) if score >= score_threshold]
filtered_scores = [score for score in scores if score >= score_threshold]
return QueryChunksResponse(chunks=filtered_chunks, scores=filtered_scores)
async def delete_chunks(self, chunks_for_deletion: list[ChunkForDeletion]) -> None:
"""Remove chunks from the Milvus collection by their IDs."""
chunk_ids = [c.chunk_id for c in chunks_for_deletion]
try:
# Use IN clause with square brackets and single quotes for VARCHAR field
chunk_ids_str = ", ".join(f"'{chunk_id}'" for chunk_id in chunk_ids)
await asyncio.to_thread(
self.client.delete, collection_name=self.collection_name, filter=f"chunk_id in [{chunk_ids_str}]"
)
except Exception as e:
logger.error(f"Error deleting chunks from Milvus collection {self.collection_name}: {e}")
raise
class MilvusVectorIOAdapter(OpenAIVectorStoreMixin, VectorIO, VectorStoresProtocolPrivate):
def __init__(
self,
config: RemoteMilvusVectorIOConfig | InlineMilvusVectorIOConfig,
inference_api: Inference,
files_api: Files | None,
) -> None:
super().__init__(files_api=files_api, kvstore=None)
self.config = config
self.cache = {}
self.client = None
self.inference_api = inference_api
self.vector_store_table = None
self.metadata_collection_name = "openai_vector_stores_metadata"
async def initialize(self) -> None:
self.kvstore = await kvstore_impl(self.config.persistence)
start_key = VECTOR_DBS_PREFIX
end_key = f"{VECTOR_DBS_PREFIX}\xff"
stored_vector_stores = await self.kvstore.values_in_range(start_key, end_key)
for vector_store_data in stored_vector_stores:
vector_store = VectorStore.model_validate_json(vector_store_data)
index = VectorStoreWithIndex(
vector_store,
index=MilvusIndex(
client=self.client,
collection_name=vector_store.identifier,
consistency_level=self.config.consistency_level,
kvstore=self.kvstore,
),
inference_api=self.inference_api,
)
self.cache[vector_store.identifier] = index
if isinstance(self.config, RemoteMilvusVectorIOConfig):
logger.info(f"Connecting to Milvus server at {self.config.uri}")
self.client = MilvusClient(**self.config.model_dump(exclude_none=True))
else:
logger.info(f"Connecting to Milvus Lite at: {self.config.db_path}")
uri = os.path.expanduser(self.config.db_path)
self.client = MilvusClient(uri=uri)
# Load existing OpenAI vector stores into the in-memory cache
await self.initialize_openai_vector_stores()
async def shutdown(self) -> None:
self.client.close()
# Clean up mixin resources (file batch tasks)
await super().shutdown()
async def register_vector_store(self, vector_store: VectorStore) -> None:
if isinstance(self.config, RemoteMilvusVectorIOConfig):
consistency_level = self.config.consistency_level
else:
consistency_level = "Strong"
index = VectorStoreWithIndex(
vector_store=vector_store,
index=MilvusIndex(self.client, vector_store.identifier, consistency_level=consistency_level),
inference_api=self.inference_api,
)
self.cache[vector_store.identifier] = index
async def _get_and_cache_vector_store_index(self, vector_store_id: str) -> VectorStoreWithIndex | None:
if vector_store_id in self.cache:
return self.cache[vector_store_id]
if self.vector_store_table is None:
raise VectorStoreNotFoundError(vector_store_id)
vector_store = await self.vector_store_table.get_vector_store(vector_store_id)
if not vector_store:
raise VectorStoreNotFoundError(vector_store_id)
index = VectorStoreWithIndex(
vector_store=vector_store,
index=MilvusIndex(client=self.client, collection_name=vector_store.identifier, kvstore=self.kvstore),
inference_api=self.inference_api,
)
self.cache[vector_store_id] = index
return index
async def unregister_vector_store(self, vector_store_id: str) -> None:
if vector_store_id in self.cache:
await self.cache[vector_store_id].index.delete()
del self.cache[vector_store_id]
async def insert_chunks(self, vector_db_id: str, chunks: list[Chunk], ttl_seconds: int | None = None) -> None:
index = await self._get_and_cache_vector_store_index(vector_db_id)
if not index:
raise VectorStoreNotFoundError(vector_db_id)
await index.insert_chunks(chunks)
async def query_chunks(
self, vector_db_id: str, query: InterleavedContent, params: dict[str, Any] | None = None
) -> QueryChunksResponse:
index = await self._get_and_cache_vector_store_index(vector_db_id)
if not index:
raise VectorStoreNotFoundError(vector_db_id)
return await index.query_chunks(query, params)
async def delete_chunks(self, store_id: str, chunks_for_deletion: list[ChunkForDeletion]) -> None:
"""Delete chunks from a Milvus vector store."""
index = await self._get_and_cache_vector_store_index(store_id)
if not index:
raise VectorStoreNotFoundError(store_id)
await index.index.delete_chunks(chunks_for_deletion)
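`MilvusIndex.delete_chunks` above builds a Milvus boolean filter by quoting each chunk ID and joining them into an `in [...]` clause. A standalone sketch of just that string construction:

```python
# Standalone sketch of the delete-filter construction used in MilvusIndex.delete_chunks.
chunk_ids = ["doc-1:chunk-0", "doc-1:chunk-1"]
chunk_ids_str = ", ".join(f"'{cid}'" for cid in chunk_ids)
filter_expr = f"chunk_id in [{chunk_ids_str}]"
print(filter_expr)  # chunk_id in ['doc-1:chunk-0', 'doc-1:chunk-1']
```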


@@ -1,17 +0,0 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from llama_stack.providers.datatypes import Api, ProviderSpec
from .config import PGVectorVectorIOConfig
async def get_adapter_impl(config: PGVectorVectorIOConfig, deps: dict[Api, ProviderSpec]):
from .pgvector import PGVectorVectorIOAdapter
impl = PGVectorVectorIOAdapter(config, deps[Api.inference], deps.get(Api.files))
await impl.initialize()
return impl


@@ -1,47 +0,0 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from typing import Any
from pydantic import BaseModel, Field
from llama_stack.core.storage.datatypes import KVStoreReference
from llama_stack.schema_utils import json_schema_type
@json_schema_type
class PGVectorVectorIOConfig(BaseModel):
host: str | None = Field(default="localhost")
port: int | None = Field(default=5432)
db: str | None = Field(default="postgres")
user: str | None = Field(default="postgres")
password: str | None = Field(default="mysecretpassword")
persistence: KVStoreReference | None = Field(
description="Config for KV store backend (SQLite only for now)", default=None
)
@classmethod
def sample_run_config(
cls,
__distro_dir__: str,
host: str = "${env.PGVECTOR_HOST:=localhost}",
port: int = "${env.PGVECTOR_PORT:=5432}",
db: str = "${env.PGVECTOR_DB}",
user: str = "${env.PGVECTOR_USER}",
password: str = "${env.PGVECTOR_PASSWORD}",
**kwargs: Any,
) -> dict[str, Any]:
return {
"host": host,
"port": port,
"db": db,
"user": user,
"password": password,
"persistence": KVStoreReference(
backend="kv_default",
namespace="vector_io::pgvector",
).model_dump(exclude_none=True),
}
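The `${env.VAR:=default}` placeholders in `sample_run_config` are resolved when the run config is loaded. As a rough illustration of that substitution syntax, here is a generic sketch under assumptions; it is not llama_stack's actual resolver:

```python
# Hedged sketch (not llama_stack's resolver): expand "${env.VAR:=default}"
# placeholders from the environment, falling back to the default after ":=".
import os
import re

_ENV_RE = re.compile(r"\$\{env\.([A-Z0-9_]+)(?::=([^}]*))?\}")


def resolve_env_placeholders(value: str) -> str:
    def _sub(m: re.Match) -> str:
        name, default = m.group(1), m.group(2)
        return os.environ.get(name, default if default is not None else "")

    return _ENV_RE.sub(_sub, value)


print(resolve_env_placeholders("${env.PGVECTOR_HOST:=localhost}"))  # "localhost" unless PGVECTOR_HOST is set
```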


@@ -1,434 +0,0 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import heapq
from typing import Any
import psycopg2
from numpy.typing import NDArray
from psycopg2 import sql
from psycopg2.extras import Json, execute_values
from pydantic import BaseModel, TypeAdapter
from llama_stack.apis.common.errors import VectorStoreNotFoundError
from llama_stack.apis.files import Files
from llama_stack.apis.inference import Inference, InterleavedContent
from llama_stack.apis.vector_io import Chunk, QueryChunksResponse, VectorIO
from llama_stack.apis.vector_stores import VectorStore
from llama_stack.log import get_logger
from llama_stack.providers.datatypes import VectorStoresProtocolPrivate
from llama_stack.providers.utils.inference.prompt_adapter import interleaved_content_as_str
from llama_stack.providers.utils.kvstore import kvstore_impl
from llama_stack.providers.utils.kvstore.api import KVStore
from llama_stack.providers.utils.memory.openai_vector_store_mixin import OpenAIVectorStoreMixin
from llama_stack.providers.utils.memory.vector_store import ChunkForDeletion, EmbeddingIndex, VectorStoreWithIndex
from llama_stack.providers.utils.vector_io.vector_utils import WeightedInMemoryAggregator, sanitize_collection_name
from .config import PGVectorVectorIOConfig
log = get_logger(name=__name__, category="vector_io::pgvector")
VERSION = "v3"
VECTOR_DBS_PREFIX = f"vector_stores:pgvector:{VERSION}::"
VECTOR_INDEX_PREFIX = f"vector_index:pgvector:{VERSION}::"
OPENAI_VECTOR_STORES_PREFIX = f"openai_vector_stores:pgvector:{VERSION}::"
OPENAI_VECTOR_STORES_FILES_PREFIX = f"openai_vector_stores_files:pgvector:{VERSION}::"
OPENAI_VECTOR_STORES_FILES_CONTENTS_PREFIX = f"openai_vector_stores_files_contents:pgvector:{VERSION}::"
def check_extension_version(cur):
cur.execute("SELECT extversion FROM pg_extension WHERE extname = 'vector'")
result = cur.fetchone()
return result[0] if result else None
def upsert_models(conn, keys_models: list[tuple[str, BaseModel]]):
with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
query = sql.SQL(
"""
INSERT INTO metadata_store (key, data)
VALUES %s
ON CONFLICT (key) DO UPDATE
SET data = EXCLUDED.data
"""
)
values = [(key, Json(model.model_dump())) for key, model in keys_models]
execute_values(cur, query, values, template="(%s, %s)")
def load_models(cur, cls):
cur.execute("SELECT key, data FROM metadata_store")
rows = cur.fetchall()
return [TypeAdapter(cls).validate_python(row["data"]) for row in rows]
class PGVectorIndex(EmbeddingIndex):
# reference: https://github.com/pgvector/pgvector?tab=readme-ov-file#querying
PGVECTOR_DISTANCE_METRIC_TO_SEARCH_FUNCTION: dict[str, str] = {
"L2": "<->",
"L1": "<+>",
"COSINE": "<=>",
"INNER_PRODUCT": "<#>",
"HAMMING": "<~>",
"JACCARD": "<%>",
}
def __init__(
self,
vector_store: VectorStore,
dimension: int,
conn: psycopg2.extensions.connection,
kvstore: KVStore | None = None,
distance_metric: str = "COSINE",
):
self.vector_store = vector_store
self.dimension = dimension
self.conn = conn
self.kvstore = kvstore
self.check_distance_metric_availability(distance_metric)
self.distance_metric = distance_metric
self.table_name = None
async def initialize(self) -> None:
try:
with self.conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
# Sanitize the table name by replacing hyphens with underscores
# SQL doesn't allow hyphens in table names, and vector_store.identifier may contain hyphens
# when created with patterns like "test-vector-db-{uuid4()}"
sanitized_identifier = sanitize_collection_name(self.vector_store.identifier)
self.table_name = f"vs_{sanitized_identifier}"
cur.execute(
f"""
CREATE TABLE IF NOT EXISTS {self.table_name} (
id TEXT PRIMARY KEY,
document JSONB,
embedding vector({self.dimension}),
content_text TEXT,
tokenized_content TSVECTOR
)
"""
)
# Create GIN index for full-text search performance
cur.execute(
f"""
CREATE INDEX IF NOT EXISTS {self.table_name}_content_gin_idx
ON {self.table_name} USING GIN(tokenized_content)
"""
)
except Exception as e:
log.exception(f"Error creating PGVectorIndex for vector_store: {self.vector_store.identifier}")
raise RuntimeError(f"Error creating PGVectorIndex for vector_store: {self.vector_store.identifier}") from e
async def add_chunks(self, chunks: list[Chunk], embeddings: NDArray):
assert len(chunks) == len(embeddings), (
f"Chunk length {len(chunks)} does not match embedding length {len(embeddings)}"
)
values = []
for i, chunk in enumerate(chunks):
content_text = interleaved_content_as_str(chunk.content)
values.append(
(
f"{chunk.chunk_id}",
Json(chunk.model_dump()),
embeddings[i].tolist(),
content_text,
content_text, # Pass content_text twice: once for the content_text column, once for the to_tsvector() call that populates tokenized_content
)
)
query = sql.SQL(
f"""
INSERT INTO {self.table_name} (id, document, embedding, content_text, tokenized_content)
VALUES %s
ON CONFLICT (id) DO UPDATE SET
embedding = EXCLUDED.embedding,
document = EXCLUDED.document,
content_text = EXCLUDED.content_text,
tokenized_content = EXCLUDED.tokenized_content
"""
)
with self.conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
execute_values(cur, query, values, template="(%s, %s, %s::vector, %s, to_tsvector('english', %s))")
async def query_vector(self, embedding: NDArray, k: int, score_threshold: float) -> QueryChunksResponse:
"""
Performs vector similarity search using PostgreSQL's search function. Default distance metric is COSINE.
Args:
embedding: The query embedding vector
k: Number of results to return
score_threshold: Minimum similarity score threshold
Returns:
QueryChunksResponse with combined results
"""
pgvector_search_function = self.get_pgvector_search_function()
with self.conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
cur.execute(
f"""
SELECT document, embedding {pgvector_search_function} %s::vector AS distance
FROM {self.table_name}
ORDER BY distance
LIMIT %s
""",
(embedding.tolist(), k),
)
results = cur.fetchall()
chunks = []
scores = []
for doc, dist in results:
score = 1.0 / float(dist) if dist != 0 else float("inf")
if score < score_threshold:
continue
chunks.append(Chunk(**doc))
scores.append(score)
return QueryChunksResponse(chunks=chunks, scores=scores)
async def query_keyword(self, query_string: str, k: int, score_threshold: float) -> QueryChunksResponse:
"""
Performs keyword-based search using PostgreSQL's full-text search with ts_rank scoring.
Args:
query_string: The text query for keyword search
k: Number of results to return
score_threshold: Minimum similarity score threshold
Returns:
QueryChunksResponse with combined results
"""
with self.conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
# Use plainto_tsquery to handle user input safely and ts_rank for relevance scoring
cur.execute(
f"""
SELECT document, ts_rank(tokenized_content, plainto_tsquery('english', %s)) AS score
FROM {self.table_name}
WHERE tokenized_content @@ plainto_tsquery('english', %s)
ORDER BY score DESC
LIMIT %s
""",
(query_string, query_string, k),
)
results = cur.fetchall()
chunks = []
scores = []
for doc, score in results:
if score < score_threshold:
continue
chunks.append(Chunk(**doc))
scores.append(float(score))
return QueryChunksResponse(chunks=chunks, scores=scores)
async def query_hybrid(
self,
embedding: NDArray,
query_string: str,
k: int,
score_threshold: float,
reranker_type: str,
reranker_params: dict[str, Any] | None = None,
) -> QueryChunksResponse:
"""
Hybrid search combining vector similarity and keyword search using configurable reranking.
Args:
embedding: The query embedding vector
query_string: The text query for keyword search
k: Number of results to return
score_threshold: Minimum similarity score threshold
reranker_type: Type of reranker to use ("rrf" or "weighted")
reranker_params: Parameters for the reranker
Returns:
QueryChunksResponse with combined results
"""
if reranker_params is None:
reranker_params = {}
# Get results from both search methods
vector_response = await self.query_vector(embedding, k, score_threshold)
keyword_response = await self.query_keyword(query_string, k, score_threshold)
# Convert responses to score dictionaries using chunk_id
vector_scores = {
chunk.chunk_id: score for chunk, score in zip(vector_response.chunks, vector_response.scores, strict=False)
}
keyword_scores = {
chunk.chunk_id: score
for chunk, score in zip(keyword_response.chunks, keyword_response.scores, strict=False)
}
# Combine scores using the reranking utility
combined_scores = WeightedInMemoryAggregator.combine_search_results(
vector_scores, keyword_scores, reranker_type, reranker_params
)
# Efficient top-k selection because it only tracks the k best candidates it's seen so far
top_k_items = heapq.nlargest(k, combined_scores.items(), key=lambda x: x[1])
# Filter by score threshold
filtered_items = [(doc_id, score) for doc_id, score in top_k_items if score >= score_threshold]
# Create a map of chunk_id to chunk for both responses
chunk_map = {c.chunk_id: c for c in vector_response.chunks + keyword_response.chunks}
# Use the map to look up chunks by their IDs
chunks = []
scores = []
for doc_id, score in filtered_items:
if doc_id in chunk_map:
chunks.append(chunk_map[doc_id])
scores.append(score)
return QueryChunksResponse(chunks=chunks, scores=scores)
async def delete(self):
with self.conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
cur.execute(f"DROP TABLE IF EXISTS {self.table_name}")
async def delete_chunks(self, chunks_for_deletion: list[ChunkForDeletion]) -> None:
"""Remove chunks from the PostgreSQL table by their IDs."""
chunk_ids = [c.chunk_id for c in chunks_for_deletion]
with self.conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
cur.execute(f"DELETE FROM {self.table_name} WHERE id = ANY(%s)", (chunk_ids,))
def get_pgvector_search_function(self) -> str:
return self.PGVECTOR_DISTANCE_METRIC_TO_SEARCH_FUNCTION[self.distance_metric]
def check_distance_metric_availability(self, distance_metric: str) -> None:
"""Check if the distance metric is supported by PGVector.
Args:
distance_metric: The distance metric to check
Raises:
ValueError: If the distance metric is not supported
"""
if distance_metric not in self.PGVECTOR_DISTANCE_METRIC_TO_SEARCH_FUNCTION:
supported_metrics = list(self.PGVECTOR_DISTANCE_METRIC_TO_SEARCH_FUNCTION.keys())
raise ValueError(
f"Distance metric '{distance_metric}' is not supported by PGVector. "
f"Supported metrics are: {', '.join(supported_metrics)}"
)
class PGVectorVectorIOAdapter(OpenAIVectorStoreMixin, VectorIO, VectorStoresProtocolPrivate):
def __init__(
self, config: PGVectorVectorIOConfig, inference_api: Inference, files_api: Files | None = None
) -> None:
super().__init__(files_api=files_api, kvstore=None)
self.config = config
self.inference_api = inference_api
self.conn = None
self.cache = {}
self.vector_store_table = None
self.metadata_collection_name = "openai_vector_stores_metadata"
async def initialize(self) -> None:
log.info(f"Initializing PGVector memory adapter with config: {self.config}")
self.kvstore = await kvstore_impl(self.config.persistence)
await self.initialize_openai_vector_stores()
try:
self.conn = psycopg2.connect(
host=self.config.host,
port=self.config.port,
database=self.config.db,
user=self.config.user,
password=self.config.password,
)
self.conn.autocommit = True
with self.conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
version = check_extension_version(cur)
if version:
log.info(f"Vector extension version: {version}")
else:
raise RuntimeError("Vector extension is not installed.")
cur.execute(
"""
CREATE TABLE IF NOT EXISTS metadata_store (
key TEXT PRIMARY KEY,
data JSONB
)
"""
)
except Exception as e:
log.exception("Could not connect to PGVector database server")
raise RuntimeError("Could not connect to PGVector database server") from e
async def shutdown(self) -> None:
if self.conn is not None:
self.conn.close()
log.info("Connection to PGVector database server closed")
# Clean up mixin resources (file batch tasks)
await super().shutdown()
async def register_vector_store(self, vector_store: VectorStore) -> None:
# Persist vector DB metadata in the KV store
assert self.kvstore is not None
# Upsert model metadata in Postgres
upsert_models(self.conn, [(vector_store.identifier, vector_store)])
# Create and cache the PGVector index table for the vector DB
pgvector_index = PGVectorIndex(
vector_store=vector_store, dimension=vector_store.embedding_dimension, conn=self.conn, kvstore=self.kvstore
)
await pgvector_index.initialize()
index = VectorStoreWithIndex(vector_store, index=pgvector_index, inference_api=self.inference_api)
self.cache[vector_store.identifier] = index
async def unregister_vector_store(self, vector_store_id: str) -> None:
# Remove provider index and cache
if vector_store_id in self.cache:
await self.cache[vector_store_id].index.delete()
del self.cache[vector_store_id]
# Delete vector DB metadata from KV store
assert self.kvstore is not None
await self.kvstore.delete(key=f"{VECTOR_DBS_PREFIX}{vector_store_id}")
async def insert_chunks(self, vector_db_id: str, chunks: list[Chunk], ttl_seconds: int | None = None) -> None:
index = await self._get_and_cache_vector_store_index(vector_db_id)
await index.insert_chunks(chunks)
async def query_chunks(
self, vector_db_id: str, query: InterleavedContent, params: dict[str, Any] | None = None
) -> QueryChunksResponse:
index = await self._get_and_cache_vector_store_index(vector_db_id)
return await index.query_chunks(query, params)
async def _get_and_cache_vector_store_index(self, vector_store_id: str) -> VectorStoreWithIndex:
if vector_store_id in self.cache:
return self.cache[vector_store_id]
if self.vector_store_table is None:
raise VectorStoreNotFoundError(vector_store_id)
vector_store = await self.vector_store_table.get_vector_store(vector_store_id)
if not vector_store:
raise VectorStoreNotFoundError(vector_store_id)
index = PGVectorIndex(vector_store, vector_store.embedding_dimension, self.conn)
await index.initialize()
self.cache[vector_store_id] = VectorStoreWithIndex(vector_store, index, self.inference_api)
return self.cache[vector_store_id]
async def delete_chunks(self, store_id: str, chunks_for_deletion: list[ChunkForDeletion]) -> None:
"""Delete chunks from a PostgreSQL vector store."""
index = await self._get_and_cache_vector_store_index(store_id)
if not index:
raise VectorStoreNotFoundError(store_id)
await index.index.delete_chunks(chunks_for_deletion)
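`query_hybrid` above fuses the vector and keyword result sets through `WeightedInMemoryAggregator.combine_search_results` with either "rrf" or "weighted" reranking. As a rough illustration of reciprocal rank fusion, here is a generic sketch; this is an assumed simplification, not the aggregator's implementation:

```python
# Generic RRF sketch (assumption, not the WeightedInMemoryAggregator code):
# each result list contributes 1 / (impact_factor + rank); per-chunk scores are summed.
def rrf_combine(
    vector_scores: dict[str, float],
    keyword_scores: dict[str, float],
    impact_factor: float = 60.0,
) -> dict[str, float]:
    combined: dict[str, float] = {}
    for scores in (vector_scores, keyword_scores):
        ranked = sorted(scores, key=scores.get, reverse=True)
        for rank, chunk_id in enumerate(ranked, start=1):
            combined[chunk_id] = combined.get(chunk_id, 0.0) + 1.0 / (impact_factor + rank)
    return combined


print(rrf_combine({"a": 0.9, "b": 0.5}, {"b": 3.2, "c": 1.1}))
```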


@@ -1,17 +0,0 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from llama_stack.providers.datatypes import Api, ProviderSpec
from .config import QdrantVectorIOConfig
async def get_adapter_impl(config: QdrantVectorIOConfig, deps: dict[Api, ProviderSpec]):
from .qdrant import QdrantVectorIOAdapter
impl = QdrantVectorIOAdapter(config, deps[Api.inference], deps.get(Api.files))
await impl.initialize()
return impl


@@ -1,37 +0,0 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from typing import Any
from pydantic import BaseModel
from llama_stack.core.storage.datatypes import KVStoreReference
from llama_stack.schema_utils import json_schema_type
@json_schema_type
class QdrantVectorIOConfig(BaseModel):
location: str | None = None
url: str | None = None
port: int | None = 6333
grpc_port: int = 6334
prefer_grpc: bool = False
https: bool | None = None
api_key: str | None = None
prefix: str | None = None
timeout: int | None = None
host: str | None = None
persistence: KVStoreReference
@classmethod
def sample_run_config(cls, __distro_dir__: str, **kwargs: Any) -> dict[str, Any]:
return {
"api_key": "${env.QDRANT_API_KEY:=}",
"persistence": KVStoreReference(
backend="kv_default",
namespace="vector_io::qdrant_remote",
).model_dump(exclude_none=True),
}


@@ -1,260 +0,0 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import asyncio
import hashlib
import uuid
from typing import Any
from numpy.typing import NDArray
from qdrant_client import AsyncQdrantClient, models
from qdrant_client.models import PointStruct
from llama_stack.apis.common.errors import VectorStoreNotFoundError
from llama_stack.apis.files import Files
from llama_stack.apis.inference import Inference, InterleavedContent
from llama_stack.apis.vector_io import (
Chunk,
QueryChunksResponse,
VectorIO,
VectorStoreChunkingStrategy,
VectorStoreFileObject,
)
from llama_stack.apis.vector_stores import VectorStore
from llama_stack.log import get_logger
from llama_stack.providers.datatypes import VectorStoresProtocolPrivate
from llama_stack.providers.inline.vector_io.qdrant import QdrantVectorIOConfig as InlineQdrantVectorIOConfig
from llama_stack.providers.utils.kvstore import kvstore_impl
from llama_stack.providers.utils.memory.openai_vector_store_mixin import OpenAIVectorStoreMixin
from llama_stack.providers.utils.memory.vector_store import ChunkForDeletion, EmbeddingIndex, VectorStoreWithIndex
from .config import QdrantVectorIOConfig as RemoteQdrantVectorIOConfig
log = get_logger(name=__name__, category="vector_io::qdrant")
CHUNK_ID_KEY = "_chunk_id"
# KV store prefixes for vector databases
VERSION = "v3"
VECTOR_DBS_PREFIX = f"vector_stores:qdrant:{VERSION}::"
def convert_id(_id: str) -> str:
"""
Converts any string into a UUID string based on a seed.
Qdrant accepts UUID strings and unsigned integers as point ID.
We use a SHA-256 hash to convert each string into a UUID string deterministically.
This allows us to overwrite the same point with the original ID.
"""
hash_input = f"qdrant_id:{_id}".encode()
sha256_hash = hashlib.sha256(hash_input).hexdigest()
# Use the first 32 characters to create a valid UUID
return str(uuid.UUID(sha256_hash[:32]))
class QdrantIndex(EmbeddingIndex):
def __init__(self, client: AsyncQdrantClient, collection_name: str):
self.client = client
self.collection_name = collection_name
async def initialize(self) -> None:
# Qdrant collections are created on demand in add_chunks; nothing to initialize here.
pass
async def add_chunks(self, chunks: list[Chunk], embeddings: NDArray):
assert len(chunks) == len(embeddings), (
f"Chunk length {len(chunks)} does not match embedding length {len(embeddings)}"
)
if not await self.client.collection_exists(self.collection_name):
await self.client.create_collection(
self.collection_name,
vectors_config=models.VectorParams(size=len(embeddings[0]), distance=models.Distance.COSINE),
)
points = []
for _i, (chunk, embedding) in enumerate(zip(chunks, embeddings, strict=False)):
chunk_id = chunk.chunk_id
points.append(
PointStruct(
id=convert_id(chunk_id),
vector=embedding,
payload={"chunk_content": chunk.model_dump()} | {CHUNK_ID_KEY: chunk_id},
)
)
await self.client.upsert(collection_name=self.collection_name, points=points)
async def delete_chunks(self, chunks_for_deletion: list[ChunkForDeletion]) -> None:
"""Remove chunks from the Qdrant collection by their IDs."""
chunk_ids = [convert_id(c.chunk_id) for c in chunks_for_deletion]
try:
await self.client.delete(
collection_name=self.collection_name, points_selector=models.PointIdsList(points=chunk_ids)
)
except Exception as e:
log.error(f"Error deleting chunks from Qdrant collection {self.collection_name}: {e}")
raise
async def query_vector(self, embedding: NDArray, k: int, score_threshold: float) -> QueryChunksResponse:
results = (
await self.client.query_points(
collection_name=self.collection_name,
query=embedding.tolist(),
limit=k,
with_payload=True,
score_threshold=score_threshold,
)
).points
chunks, scores = [], []
for point in results:
assert isinstance(point, models.ScoredPoint)
assert point.payload is not None
try:
chunk = Chunk(**point.payload["chunk_content"])
except Exception:
log.exception("Failed to parse chunk")
continue
chunks.append(chunk)
scores.append(point.score)
return QueryChunksResponse(chunks=chunks, scores=scores)
async def query_keyword(self, query_string: str, k: int, score_threshold: float) -> QueryChunksResponse:
raise NotImplementedError("Keyword search is not supported in Qdrant")
async def query_hybrid(
self,
embedding: NDArray,
query_string: str,
k: int,
score_threshold: float,
reranker_type: str,
reranker_params: dict[str, Any] | None = None,
) -> QueryChunksResponse:
raise NotImplementedError("Hybrid search is not supported in Qdrant")
async def delete(self):
await self.client.delete_collection(collection_name=self.collection_name)
class QdrantVectorIOAdapter(OpenAIVectorStoreMixin, VectorIO, VectorStoresProtocolPrivate):
def __init__(
self,
config: RemoteQdrantVectorIOConfig | InlineQdrantVectorIOConfig,
inference_api: Inference,
files_api: Files | None = None,
) -> None:
super().__init__(files_api=files_api, kvstore=None)
self.config = config
self.client: AsyncQdrantClient | None = None
self.cache = {}
self.inference_api = inference_api
self.vector_store_table = None
self._qdrant_lock = asyncio.Lock()
async def initialize(self) -> None:
client_config = self.config.model_dump(exclude_none=True, exclude={"persistence"})
self.client = AsyncQdrantClient(**client_config)
self.kvstore = await kvstore_impl(self.config.persistence)
start_key = VECTOR_DBS_PREFIX
end_key = f"{VECTOR_DBS_PREFIX}\xff"
stored_vector_stores = await self.kvstore.values_in_range(start_key, end_key)
for vector_store_data in stored_vector_stores:
vector_store = VectorStore.model_validate_json(vector_store_data)
index = VectorStoreWithIndex(
vector_store, QdrantIndex(self.client, vector_store.identifier), self.inference_api
)
self.cache[vector_store.identifier] = index
self.openai_vector_stores = await self._load_openai_vector_stores()
async def shutdown(self) -> None:
await self.client.close()
# Clean up mixin resources (file batch tasks)
await super().shutdown()
async def register_vector_store(self, vector_store: VectorStore) -> None:
assert self.kvstore is not None
key = f"{VECTOR_DBS_PREFIX}{vector_store.identifier}"
await self.kvstore.set(key=key, value=vector_store.model_dump_json())
index = VectorStoreWithIndex(
vector_store=vector_store,
index=QdrantIndex(self.client, vector_store.identifier),
inference_api=self.inference_api,
)
self.cache[vector_store.identifier] = index
async def unregister_vector_store(self, vector_store_id: str) -> None:
if vector_store_id in self.cache:
await self.cache[vector_store_id].index.delete()
del self.cache[vector_store_id]
assert self.kvstore is not None
await self.kvstore.delete(f"{VECTOR_DBS_PREFIX}{vector_store_id}")
async def _get_and_cache_vector_store_index(self, vector_store_id: str) -> VectorStoreWithIndex | None:
if vector_store_id in self.cache:
return self.cache[vector_store_id]
if self.vector_store_table is None:
raise ValueError(f"Vector DB {vector_store_id} not found")
vector_store = await self.vector_store_table.get_vector_store(vector_store_id)
if not vector_store:
raise VectorStoreNotFoundError(vector_store_id)
index = VectorStoreWithIndex(
vector_store=vector_store,
index=QdrantIndex(client=self.client, collection_name=vector_store.identifier),
inference_api=self.inference_api,
)
self.cache[vector_store_id] = index
return index
async def insert_chunks(self, vector_db_id: str, chunks: list[Chunk], ttl_seconds: int | None = None) -> None:
index = await self._get_and_cache_vector_store_index(vector_db_id)
if not index:
raise VectorStoreNotFoundError(vector_db_id)
await index.insert_chunks(chunks)
async def query_chunks(
self, vector_db_id: str, query: InterleavedContent, params: dict[str, Any] | None = None
) -> QueryChunksResponse:
index = await self._get_and_cache_vector_store_index(vector_db_id)
if not index:
raise VectorStoreNotFoundError(vector_db_id)
return await index.query_chunks(query, params)
async def openai_attach_file_to_vector_store(
self,
vector_store_id: str,
file_id: str,
attributes: dict[str, Any] | None = None,
chunking_strategy: VectorStoreChunkingStrategy | None = None,
) -> VectorStoreFileObject:
# Qdrant doesn't allow multiple clients to access the same storage path simultaneously.
async with self._qdrant_lock:
return await super().openai_attach_file_to_vector_store(
vector_store_id, file_id, attributes, chunking_strategy
)
async def delete_chunks(self, store_id: str, chunks_for_deletion: list[ChunkForDeletion]) -> None:
"""Delete chunks from a Qdrant vector store."""
index = await self._get_and_cache_vector_store_index(store_id)
if not index:
raise ValueError(f"Vector DB {store_id} not found")
await index.index.delete_chunks(chunks_for_deletion)
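Qdrant accepts only UUIDs or unsigned integers as point IDs, so `convert_id` above hashes each chunk ID into a deterministic UUID; re-upserting the same chunk therefore overwrites the same point rather than creating a duplicate. A standalone reproduction of that behavior:

```python
# Same deterministic ID conversion as convert_id above, shown standalone.
import hashlib
import uuid


def convert_id(_id: str) -> str:
    sha256_hash = hashlib.sha256(f"qdrant_id:{_id}".encode()).hexdigest()
    return str(uuid.UUID(sha256_hash[:32]))


assert convert_id("doc-1:chunk-0") == convert_id("doc-1:chunk-0")
print(convert_id("doc-1:chunk-0"))
```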


@@ -1,17 +0,0 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from llama_stack.providers.datatypes import Api, ProviderSpec
from .config import WeaviateVectorIOConfig
async def get_adapter_impl(config: WeaviateVectorIOConfig, deps: dict[Api, ProviderSpec]):
from .weaviate import WeaviateVectorIOAdapter
impl = WeaviateVectorIOAdapter(config, deps[Api.inference], deps.get(Api.files))
await impl.initialize()
return impl


@@ -1,32 +0,0 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from typing import Any
from pydantic import BaseModel, Field
from llama_stack.core.storage.datatypes import KVStoreReference
from llama_stack.schema_utils import json_schema_type
@json_schema_type
class WeaviateVectorIOConfig(BaseModel):
weaviate_api_key: str | None = Field(description="The API key for the Weaviate instance", default=None)
weaviate_cluster_url: str | None = Field(description="The URL of the Weaviate cluster", default="localhost:8080")
persistence: KVStoreReference | None = Field(
description="Config for KV store backend (SQLite only for now)", default=None
)
@classmethod
def sample_run_config(cls, __distro_dir__: str, **kwargs: Any) -> dict[str, Any]:
return {
"weaviate_api_key": None,
"weaviate_cluster_url": "${env.WEAVIATE_CLUSTER_URL:=localhost:8080}",
"persistence": KVStoreReference(
backend="kv_default",
namespace="vector_io::weaviate",
).model_dump(exclude_none=True),
}


@@ -1,390 +0,0 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import json
from typing import Any
import weaviate
import weaviate.classes as wvc
from numpy.typing import NDArray
from weaviate.classes.init import Auth
from weaviate.classes.query import Filter, HybridFusion
from llama_stack.apis.common.content_types import InterleavedContent
from llama_stack.apis.common.errors import VectorStoreNotFoundError
from llama_stack.apis.files import Files
from llama_stack.apis.inference import Inference
from llama_stack.apis.vector_io import Chunk, QueryChunksResponse, VectorIO
from llama_stack.apis.vector_stores import VectorStore
from llama_stack.core.request_headers import NeedsRequestProviderData
from llama_stack.log import get_logger
from llama_stack.providers.datatypes import VectorStoresProtocolPrivate
from llama_stack.providers.utils.kvstore import kvstore_impl
from llama_stack.providers.utils.kvstore.api import KVStore
from llama_stack.providers.utils.memory.openai_vector_store_mixin import OpenAIVectorStoreMixin
from llama_stack.providers.utils.memory.vector_store import (
RERANKER_TYPE_RRF,
ChunkForDeletion,
EmbeddingIndex,
VectorStoreWithIndex,
)
from llama_stack.providers.utils.vector_io.vector_utils import sanitize_collection_name
from .config import WeaviateVectorIOConfig
log = get_logger(name=__name__, category="vector_io::weaviate")
VERSION = "v3"
VECTOR_DBS_PREFIX = f"vector_stores:weaviate:{VERSION}::"
VECTOR_INDEX_PREFIX = f"vector_index:weaviate:{VERSION}::"
OPENAI_VECTOR_STORES_PREFIX = f"openai_vector_stores:weaviate:{VERSION}::"
OPENAI_VECTOR_STORES_FILES_PREFIX = f"openai_vector_stores_files:weaviate:{VERSION}::"
OPENAI_VECTOR_STORES_FILES_CONTENTS_PREFIX = f"openai_vector_stores_files_contents:weaviate:{VERSION}::"
class WeaviateIndex(EmbeddingIndex):
def __init__(self, client: weaviate.WeaviateClient, collection_name: str, kvstore: KVStore | None = None):
self.client = client
self.collection_name = sanitize_collection_name(collection_name, weaviate_format=True)
self.kvstore = kvstore
async def initialize(self):
pass
async def add_chunks(self, chunks: list[Chunk], embeddings: NDArray):
assert len(chunks) == len(embeddings), (
f"Chunk length {len(chunks)} does not match embedding length {len(embeddings)}"
)
data_objects = []
for chunk, embedding in zip(chunks, embeddings, strict=False):
data_objects.append(
wvc.data.DataObject(
properties={
"chunk_id": chunk.chunk_id,
"chunk_content": chunk.model_dump_json(),
},
vector=embedding.tolist(),
)
)
# Inserting chunks into a prespecified Weaviate collection
collection = self.client.collections.get(self.collection_name)
# TODO: make this async friendly
collection.data.insert_many(data_objects)
async def delete_chunks(self, chunks_for_deletion: list[ChunkForDeletion]) -> None:
sanitized_collection_name = sanitize_collection_name(self.collection_name, weaviate_format=True)
collection = self.client.collections.get(sanitized_collection_name)
chunk_ids = [chunk.chunk_id for chunk in chunks_for_deletion]
collection.data.delete_many(where=Filter.by_property("chunk_id").contains_any(chunk_ids))
async def query_vector(self, embedding: NDArray, k: int, score_threshold: float) -> QueryChunksResponse:
"""
Performs vector search using Weaviate's built-in vector search.
Args:
embedding: The query embedding vector
k: Limit of number of results to return
score_threshold: Minimum similarity score threshold
Returns:
QueryChunksResponse with chunks and scores.
"""
log.debug(
f"WEAVIATE VECTOR SEARCH CALLED: embedding_shape={embedding.shape}, k={k}, threshold={score_threshold}"
)
sanitized_collection_name = sanitize_collection_name(self.collection_name, weaviate_format=True)
collection = self.client.collections.get(sanitized_collection_name)
try:
results = collection.query.near_vector(
near_vector=embedding.tolist(), limit=k, return_metadata=wvc.query.MetadataQuery(distance=True)
)
except Exception as e:
log.error(f"Weaviate client vector search failed: {e}")
raise
chunks = []
scores = []
for doc in results.objects:
chunk_json = doc.properties["chunk_content"]
try:
chunk_dict = json.loads(chunk_json)
chunk = Chunk(**chunk_dict)
except Exception:
log.exception(f"Failed to parse document: {chunk_json}")
continue
if doc.metadata.distance is None:
continue
# Convert cosine distance ∈ [0,2] -> normalized cosine similarity ∈ [0,1]
score = 1.0 - (float(doc.metadata.distance) / 2.0)
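            # e.g. distance 0.0 (identical vectors) -> score 1.0, distance 1.0 (orthogonal) -> 0.5,
            # distance 2.0 (opposite) -> 0.0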
if score < score_threshold:
continue
chunks.append(chunk)
scores.append(score)
log.debug(f"WEAVIATE VECTOR SEARCH RESULTS: Found {len(chunks)} chunks with scores {scores}")
return QueryChunksResponse(chunks=chunks, scores=scores)
async def delete(self, chunk_ids: list[str] | None = None) -> None:
"""
Delete chunks by IDs if provided, otherwise drop the entire collection.
"""
sanitized_collection_name = sanitize_collection_name(self.collection_name, weaviate_format=True)
if chunk_ids is None:
# Drop entire collection if it exists
if self.client.collections.exists(sanitized_collection_name):
self.client.collections.delete(sanitized_collection_name)
return
collection = self.client.collections.get(sanitized_collection_name)
collection.data.delete_many(where=Filter.by_property("id").contains_any(chunk_ids))
async def query_keyword(self, query_string: str, k: int, score_threshold: float) -> QueryChunksResponse:
"""
Performs BM25-based keyword search using Weaviate's built-in full-text search.
Args:
query_string: The text query for keyword search
k: Limit of number of results to return
score_threshold: Minimum similarity score threshold
Returns:
QueryChunksResponse with chunks and scores
"""
log.debug(f"WEAVIATE KEYWORD SEARCH CALLED: query='{query_string}', k={k}, threshold={score_threshold}")
sanitized_collection_name = sanitize_collection_name(self.collection_name, weaviate_format=True)
collection = self.client.collections.get(sanitized_collection_name)
# Perform BM25 keyword search on chunk_content field
try:
results = collection.query.bm25(
query=query_string, limit=k, return_metadata=wvc.query.MetadataQuery(score=True)
)
except Exception as e:
log.error(f"Weaviate client keyword search failed: {e}")
raise
chunks = []
scores = []
for doc in results.objects:
chunk_json = doc.properties["chunk_content"]
try:
chunk_dict = json.loads(chunk_json)
chunk = Chunk(**chunk_dict)
except Exception:
log.exception(f"Failed to parse document: {chunk_json}")
continue
score = doc.metadata.score if doc.metadata.score is not None else 0.0
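            # Note: BM25 scores are raw relevance values, not bounded to [0, 1], so a
            # score_threshold tuned for cosine similarity may not transfer directly here.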
if score < score_threshold:
continue
chunks.append(chunk)
scores.append(score)
log.debug(f"WEAVIATE KEYWORD SEARCH RESULTS: Found {len(chunks)} chunks with scores {scores}.")
return QueryChunksResponse(chunks=chunks, scores=scores)
async def query_hybrid(
self,
embedding: NDArray,
query_string: str,
k: int,
score_threshold: float,
reranker_type: str,
reranker_params: dict[str, Any] | None = None,
) -> QueryChunksResponse:
"""
Hybrid search combining vector similarity and keyword search using Weaviate's native hybrid search.
Args:
embedding: The query embedding vector
query_string: The text query for keyword search
k: Limit of number of results to return
score_threshold: Minimum similarity score threshold
reranker_type: Type of reranker to use ("rrf" or "normalized")
reranker_params: Parameters for the reranker
Returns:
QueryChunksResponse with combined results
"""
log.debug(
f"WEAVIATE HYBRID SEARCH CALLED: query='{query_string}', embedding_shape={embedding.shape}, k={k}, threshold={score_threshold}, reranker={reranker_type}"
)
sanitized_collection_name = sanitize_collection_name(self.collection_name, weaviate_format=True)
collection = self.client.collections.get(sanitized_collection_name)
# Ranked (RRF) reranker fusion type
if reranker_type == RERANKER_TYPE_RRF:
rerank = HybridFusion.RANKED
# Relative score (Normalized) reranker fusion type
else:
rerank = HybridFusion.RELATIVE_SCORE
# Perform hybrid search using Weaviate's native hybrid search
try:
results = collection.query.hybrid(
query=query_string,
alpha=0.5, # Range <0, 1>, where 0.5 will equally favor vector and keyword search
vector=embedding.tolist(),
limit=k,
fusion_type=rerank,
return_metadata=wvc.query.MetadataQuery(score=True),
)
except Exception as e:
log.error(f"Weaviate client hybrid search failed: {e}")
raise
chunks = []
scores = []
for doc in results.objects:
chunk_json = doc.properties["chunk_content"]
try:
chunk_dict = json.loads(chunk_json)
chunk = Chunk(**chunk_dict)
except Exception:
log.exception(f"Failed to parse document: {chunk_json}")
continue
score = doc.metadata.score if doc.metadata.score is not None else 0.0
if score < score_threshold:
continue
chunks.append(chunk)
scores.append(score)
log.debug(f"WEAVIATE HYBRID SEARCH RESULTS: Found {len(chunks)} chunks with scores {scores}")
return QueryChunksResponse(chunks=chunks, scores=scores)
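# Illustrative usage sketch (not part of the original file): how WeaviateIndex is queried,
# assuming `client` is an already-connected weaviate.WeaviateClient and `query_vec` is a
# numpy embedding produced by the same model that was used when the chunks were inserted.
async def _example_hybrid_query(client: weaviate.WeaviateClient, query_vec: NDArray) -> None:
    index = WeaviateIndex(client=client, collection_name="vs_demo")
    resp = await index.query_hybrid(
        embedding=query_vec,
        query_string="how do I rotate an API key?",
        k=5,
        score_threshold=0.0,
        reranker_type=RERANKER_TYPE_RRF,
    )
    for chunk, score in zip(resp.chunks, resp.scores, strict=False):
        log.info(f"hybrid hit score={score:.3f} chunk_id={chunk.chunk_id}")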
class WeaviateVectorIOAdapter(OpenAIVectorStoreMixin, VectorIO, NeedsRequestProviderData, VectorStoresProtocolPrivate):
def __init__(self, config: WeaviateVectorIOConfig, inference_api: Inference, files_api: Files | None) -> None:
super().__init__(files_api=files_api, kvstore=None)
self.config = config
self.inference_api = inference_api
self.client_cache = {}
self.cache = {}
self.vector_store_table = None
self.metadata_collection_name = "openai_vector_stores_metadata"
    def _get_client(self) -> weaviate.WeaviateClient:
        if "localhost" in self.config.weaviate_cluster_url:
            log.info("Using Weaviate locally in container")
            host, port = self.config.weaviate_cluster_url.split(":")
            key = "local_test"
            # Reuse an existing local connection instead of opening a new one on every call.
            if key in self.client_cache:
                return self.client_cache[key]
            client = weaviate.connect_to_local(host=host, port=int(port))
else:
log.info("Using Weaviate remote cluster with URL")
key = f"{self.config.weaviate_cluster_url}::{self.config.weaviate_api_key}"
if key in self.client_cache:
return self.client_cache[key]
client = weaviate.connect_to_weaviate_cloud(
cluster_url=self.config.weaviate_cluster_url,
auth_credentials=Auth.api_key(self.config.weaviate_api_key),
)
self.client_cache[key] = client
return client
async def initialize(self) -> None:
"""Set up KV store and load existing vector DBs and OpenAI vector stores."""
# Initialize KV store for metadata if configured
if self.config.persistence is not None:
self.kvstore = await kvstore_impl(self.config.persistence)
else:
self.kvstore = None
log.info("No kvstore configured, registry will not persist across restarts")
# Load existing vector DB definitions
if self.kvstore is not None:
start_key = VECTOR_DBS_PREFIX
end_key = f"{VECTOR_DBS_PREFIX}\xff"
stored = await self.kvstore.values_in_range(start_key, end_key)
for raw in stored:
vector_store = VectorStore.model_validate_json(raw)
client = self._get_client()
idx = WeaviateIndex(client=client, collection_name=vector_store.identifier, kvstore=self.kvstore)
self.cache[vector_store.identifier] = VectorStoreWithIndex(
vector_store=vector_store, index=idx, inference_api=self.inference_api
)
# Load OpenAI vector stores metadata into cache
await self.initialize_openai_vector_stores()
async def shutdown(self) -> None:
for client in self.client_cache.values():
client.close()
# Clean up mixin resources (file batch tasks)
await super().shutdown()
async def register_vector_store(self, vector_store: VectorStore) -> None:
client = self._get_client()
sanitized_collection_name = sanitize_collection_name(vector_store.identifier, weaviate_format=True)
# Create collection if it doesn't exist
if not client.collections.exists(sanitized_collection_name):
client.collections.create(
name=sanitized_collection_name,
vectorizer_config=wvc.config.Configure.Vectorizer.none(),
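                # Embeddings are computed upstream via the inference API and passed in with each
                # chunk, so Weaviate's built-in vectorizer stays disabled.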
properties=[
wvc.config.Property(name="chunk_content", data_type=wvc.config.DataType.TEXT),
],
)
self.cache[vector_store.identifier] = VectorStoreWithIndex(
vector_store, WeaviateIndex(client=client, collection_name=sanitized_collection_name), self.inference_api
)
async def unregister_vector_store(self, vector_store_id: str) -> None:
client = self._get_client()
sanitized_collection_name = sanitize_collection_name(vector_store_id, weaviate_format=True)
        if vector_store_id not in self.cache or not client.collections.exists(sanitized_collection_name):
return
client.collections.delete(sanitized_collection_name)
await self.cache[vector_store_id].index.delete()
del self.cache[vector_store_id]
async def _get_and_cache_vector_store_index(self, vector_store_id: str) -> VectorStoreWithIndex | None:
if vector_store_id in self.cache:
return self.cache[vector_store_id]
if self.vector_store_table is None:
raise VectorStoreNotFoundError(vector_store_id)
vector_store = await self.vector_store_table.get_vector_store(vector_store_id)
if not vector_store:
raise VectorStoreNotFoundError(vector_store_id)
client = self._get_client()
sanitized_collection_name = sanitize_collection_name(vector_store.identifier, weaviate_format=True)
if not client.collections.exists(sanitized_collection_name):
raise ValueError(f"Collection with name `{sanitized_collection_name}` not found")
index = VectorStoreWithIndex(
vector_store=vector_store,
index=WeaviateIndex(client=client, collection_name=vector_store.identifier),
inference_api=self.inference_api,
)
self.cache[vector_store_id] = index
return index
async def insert_chunks(self, vector_db_id: str, chunks: list[Chunk], ttl_seconds: int | None = None) -> None:
index = await self._get_and_cache_vector_store_index(vector_db_id)
if not index:
raise VectorStoreNotFoundError(vector_db_id)
await index.insert_chunks(chunks)
async def query_chunks(
self, vector_db_id: str, query: InterleavedContent, params: dict[str, Any] | None = None
) -> QueryChunksResponse:
index = await self._get_and_cache_vector_store_index(vector_db_id)
if not index:
raise VectorStoreNotFoundError(vector_db_id)
return await index.query_chunks(query, params)
async def delete_chunks(self, store_id: str, chunks_for_deletion: list[ChunkForDeletion]) -> None:
index = await self._get_and_cache_vector_store_index(store_id)
if not index:
raise ValueError(f"Vector DB {store_id} not found")
await index.index.delete_chunks(chunks_for_deletion)
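# Illustrative end-to-end sketch (not part of the original file): exercising the adapter above,
# assuming `adapter` is an initialized WeaviateVectorIOAdapter and a vector store with
# identifier "vs_demo" has already been registered; the params keys are assumptions.
async def _example_roundtrip(adapter: WeaviateVectorIOAdapter) -> QueryChunksResponse:
    chunks = [
        Chunk(
            content="Weaviate stores each chunk's JSON next to its embedding",
            metadata={"source": "demo"},
        )
    ]
    await adapter.insert_chunks(vector_db_id="vs_demo", chunks=chunks)
    return await adapter.query_chunks(
        vector_db_id="vs_demo",
        query="how are chunks stored?",
        params={"max_chunks": 3},
    )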