Mirror of https://github.com/meta-llama/llama-stack.git (synced 2025-10-22 16:23:08 +00:00)
chore(cleanup)!: remove tool_runtime.rag_tool (#3871)
Kill the `builtin::rag` tool group completely, since it is no longer targeted. knowledge_search now goes through the Responses implementation, which uses the `openai_vector_stores` pathway.

Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>
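For anyone migrating off `builtin::rag`: retrieval is now done through the OpenAI-compatible Responses API with a `file_search` tool pointed at a vector store. Below is a minimal sketch, assuming a Llama Stack server at a placeholder base URL, a placeholder model id, and an already-populated vector store (ingestion is sketched after the last deleted file below); the exact client surface may differ by SDK version.

```python
# Sketch only: knowledge_search via the Responses + openai_vector_stores pathway.
# Base URL, model id, and vector store id are placeholders, not values from this PR.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8321/v1/openai/v1", api_key="none")

response = client.responses.create(
    model="llama3.2:3b",
    input="What does the deployment guide say about TLS termination?",
    tools=[{"type": "file_search", "vector_store_ids": ["vs_123"]}],
)
print(response.output_text)
```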
parent 5aaf1a8bca
commit 0e96279bee
55 changed files with 17 additions and 3114 deletions
@@ -1,5 +0,0 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
@@ -1,19 +0,0 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

from typing import Any

from llama_stack.providers.datatypes import Api

from .config import RagToolRuntimeConfig


async def get_provider_impl(config: RagToolRuntimeConfig, deps: dict[Api, Any]):
    from .memory import MemoryToolRuntimeImpl

    impl = MemoryToolRuntimeImpl(config, deps[Api.vector_io], deps[Api.inference], deps[Api.files])
    await impl.initialize()
    return impl
@@ -1,15 +0,0 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

from typing import Any

from pydantic import BaseModel


class RagToolRuntimeConfig(BaseModel):
    @classmethod
    def sample_run_config(cls, __distro_dir__: str, **kwargs: Any) -> dict[str, Any]:
        return {}
@@ -1,77 +0,0 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.


from jinja2 import Template

from llama_stack.apis.common.content_types import InterleavedContent
from llama_stack.apis.inference import OpenAIChatCompletionRequestWithExtraBody, OpenAIUserMessageParam
from llama_stack.apis.tools.rag_tool import (
    DefaultRAGQueryGeneratorConfig,
    LLMRAGQueryGeneratorConfig,
    RAGQueryGenerator,
    RAGQueryGeneratorConfig,
)
from llama_stack.providers.utils.inference.prompt_adapter import (
    interleaved_content_as_str,
)


async def generate_rag_query(
    config: RAGQueryGeneratorConfig,
    content: InterleavedContent,
    **kwargs,
):
    """
    Generates a query that will be used for
    retrieving relevant information from the memory bank.
    """
    if config.type == RAGQueryGenerator.default.value:
        query = await default_rag_query_generator(config, content, **kwargs)
    elif config.type == RAGQueryGenerator.llm.value:
        query = await llm_rag_query_generator(config, content, **kwargs)
    else:
        raise NotImplementedError(f"Unsupported memory query generator {config.type}")
    return query


async def default_rag_query_generator(
    config: DefaultRAGQueryGeneratorConfig,
    content: InterleavedContent,
    **kwargs,
):
    return interleaved_content_as_str(content, sep=config.separator)


async def llm_rag_query_generator(
    config: LLMRAGQueryGeneratorConfig,
    content: InterleavedContent,
    **kwargs,
):
    assert "inference_api" in kwargs, "LLMRAGQueryGenerator needs inference_api"
    inference_api = kwargs["inference_api"]

    messages = []
    if isinstance(content, list):
        messages = [interleaved_content_as_str(m) for m in content]
    else:
        messages = [interleaved_content_as_str(content)]

    template = Template(config.template)
    rendered_content: str = template.render({"messages": messages})

    model = config.model
    message = OpenAIUserMessageParam(content=rendered_content)
    params = OpenAIChatCompletionRequestWithExtraBody(
        model=model,
        messages=[message],
        stream=False,
    )
    response = await inference_api.openai_chat_completion(params)

    query = response.choices[0].message.content

    return query
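The deleted `llm_rag_query_generator` above rewrote the conversation into a single retrieval query by rendering a Jinja template over the stringified messages and sending the result to the chat-completion API. A standalone sketch of just the template step follows; the template string and messages here are illustrative, not the config's defaults.

```python
# Illustrative only: mirrors the Template(config.template).render(...) step of the
# removed llm_rag_query_generator. The template text and messages are made-up examples.
from jinja2 import Template

messages = [
    "User: How do I rotate my API keys?",
    "Assistant: Which provider are you using?",
    "User: The inline vLLM one.",
]

template = Template(
    "Given the conversation below, write a short search query that would "
    "retrieve documents needed to answer the last user message.\n\n"
    "{% for m in messages %}{{ m }}\n{% endfor %}"
)
prompt = template.render({"messages": messages})
print(prompt)  # this rendered prompt would then be sent to the LLM as a user message
```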
@@ -1,332 +0,0 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

import asyncio
import base64
import io
import mimetypes
from typing import Any

import httpx
from fastapi import UploadFile
from pydantic import TypeAdapter

from llama_stack.apis.common.content_types import (
    URL,
    InterleavedContent,
    InterleavedContentItem,
    TextContentItem,
)
from llama_stack.apis.files import Files, OpenAIFilePurpose
from llama_stack.apis.inference import Inference
from llama_stack.apis.tools import (
    ListToolDefsResponse,
    RAGDocument,
    RAGQueryConfig,
    RAGQueryResult,
    RAGToolRuntime,
    ToolDef,
    ToolGroup,
    ToolInvocationResult,
    ToolRuntime,
)
from llama_stack.apis.vector_io import (
    QueryChunksResponse,
    VectorIO,
    VectorStoreChunkingStrategyStatic,
    VectorStoreChunkingStrategyStaticConfig,
)
from llama_stack.log import get_logger
from llama_stack.providers.datatypes import ToolGroupsProtocolPrivate
from llama_stack.providers.utils.inference.prompt_adapter import interleaved_content_as_str
from llama_stack.providers.utils.memory.vector_store import parse_data_url

from .config import RagToolRuntimeConfig
from .context_retriever import generate_rag_query

log = get_logger(name=__name__, category="tool_runtime")


async def raw_data_from_doc(doc: RAGDocument) -> tuple[bytes, str]:
    """Get raw binary data and mime type from a RAGDocument for file upload."""
    if isinstance(doc.content, URL):
        if doc.content.uri.startswith("data:"):
            parts = parse_data_url(doc.content.uri)
            mime_type = parts["mimetype"]
            data = parts["data"]

            if parts["is_base64"]:
                file_data = base64.b64decode(data)
            else:
                file_data = data.encode("utf-8")

            return file_data, mime_type
        else:
            async with httpx.AsyncClient() as client:
                r = await client.get(doc.content.uri)
                r.raise_for_status()
                mime_type = r.headers.get("content-type", "application/octet-stream")
                return r.content, mime_type
    else:
        if isinstance(doc.content, str):
            content_str = doc.content
        else:
            content_str = interleaved_content_as_str(doc.content)

        if content_str.startswith("data:"):
            parts = parse_data_url(content_str)
            mime_type = parts["mimetype"]
            data = parts["data"]

            if parts["is_base64"]:
                file_data = base64.b64decode(data)
            else:
                file_data = data.encode("utf-8")

            return file_data, mime_type
        else:
            return content_str.encode("utf-8"), "text/plain"


class MemoryToolRuntimeImpl(ToolGroupsProtocolPrivate, ToolRuntime, RAGToolRuntime):
    def __init__(
        self,
        config: RagToolRuntimeConfig,
        vector_io_api: VectorIO,
        inference_api: Inference,
        files_api: Files,
    ):
        self.config = config
        self.vector_io_api = vector_io_api
        self.inference_api = inference_api
        self.files_api = files_api

    async def initialize(self):
        pass

    async def shutdown(self):
        pass

    async def register_toolgroup(self, toolgroup: ToolGroup) -> None:
        pass

    async def unregister_toolgroup(self, toolgroup_id: str) -> None:
        return

    async def insert(
        self,
        documents: list[RAGDocument],
        vector_db_id: str,
        chunk_size_in_tokens: int = 512,
    ) -> None:
        if not documents:
            return

        for doc in documents:
            try:
                try:
                    file_data, mime_type = await raw_data_from_doc(doc)
                except Exception as e:
                    log.error(f"Failed to extract content from document {doc.document_id}: {e}")
                    continue

                file_extension = mimetypes.guess_extension(mime_type) or ".txt"
                filename = doc.metadata.get("filename", f"{doc.document_id}{file_extension}")

                file_obj = io.BytesIO(file_data)
                file_obj.name = filename

                upload_file = UploadFile(file=file_obj, filename=filename)

                try:
                    created_file = await self.files_api.openai_upload_file(
                        file=upload_file, purpose=OpenAIFilePurpose.ASSISTANTS
                    )
                except Exception as e:
                    log.error(f"Failed to upload file for document {doc.document_id}: {e}")
                    continue

                chunking_strategy = VectorStoreChunkingStrategyStatic(
                    static=VectorStoreChunkingStrategyStaticConfig(
                        max_chunk_size_tokens=chunk_size_in_tokens,
                        chunk_overlap_tokens=chunk_size_in_tokens // 4,
                    )
                )

                try:
                    await self.vector_io_api.openai_attach_file_to_vector_store(
                        vector_store_id=vector_db_id,
                        file_id=created_file.id,
                        attributes=doc.metadata,
                        chunking_strategy=chunking_strategy,
                    )
                except Exception as e:
                    log.error(
                        f"Failed to attach file {created_file.id} to vector store {vector_db_id} for document {doc.document_id}: {e}"
                    )
                    continue

            except Exception as e:
                log.error(f"Unexpected error processing document {doc.document_id}: {e}")
                continue

    async def query(
        self,
        content: InterleavedContent,
        vector_db_ids: list[str],
        query_config: RAGQueryConfig | None = None,
    ) -> RAGQueryResult:
        if not vector_db_ids:
            raise ValueError(
                "No vector DBs were provided to the knowledge search tool. Please provide at least one vector DB ID."
            )

        query_config = query_config or RAGQueryConfig()
        query = await generate_rag_query(
            query_config.query_generator_config,
            content,
            inference_api=self.inference_api,
        )
        tasks = [
            self.vector_io_api.query_chunks(
                vector_db_id=vector_db_id,
                query=query,
                params={
                    "mode": query_config.mode,
                    "max_chunks": query_config.max_chunks,
                    "score_threshold": 0.0,
                    "ranker": query_config.ranker,
                },
            )
            for vector_db_id in vector_db_ids
        ]
        results: list[QueryChunksResponse] = await asyncio.gather(*tasks)

        chunks = []
        scores = []

        for vector_db_id, result in zip(vector_db_ids, results, strict=False):
            for chunk, score in zip(result.chunks, result.scores, strict=False):
                if not hasattr(chunk, "metadata") or chunk.metadata is None:
                    chunk.metadata = {}
                chunk.metadata["vector_db_id"] = vector_db_id

                chunks.append(chunk)
                scores.append(score)

        if not chunks:
            return RAGQueryResult(content=None)

        # sort by score
        chunks, scores = zip(*sorted(zip(chunks, scores, strict=False), key=lambda x: x[1], reverse=True), strict=False)  # type: ignore
        chunks = chunks[: query_config.max_chunks]

        tokens = 0
        picked: list[InterleavedContentItem] = [
            TextContentItem(
                text=f"knowledge_search tool found {len(chunks)} chunks:\nBEGIN of knowledge_search tool results.\n"
            )
        ]
        for i, chunk in enumerate(chunks):
            metadata = chunk.metadata
            tokens += metadata.get("token_count", 0)
            tokens += metadata.get("metadata_token_count", 0)

            if tokens > query_config.max_tokens_in_context:
                log.error(
                    f"Using {len(picked)} chunks; reached max tokens in context: {tokens}",
                )
                break

            # Add useful keys from chunk_metadata to metadata and remove some from metadata
            chunk_metadata_keys_to_include_from_context = [
                "chunk_id",
                "document_id",
                "source",
            ]
            metadata_keys_to_exclude_from_context = [
                "token_count",
                "metadata_token_count",
                "vector_db_id",
            ]
            metadata_for_context = {}
            for k in chunk_metadata_keys_to_include_from_context:
                metadata_for_context[k] = getattr(chunk.chunk_metadata, k)
            for k in metadata:
                if k not in metadata_keys_to_exclude_from_context:
                    metadata_for_context[k] = metadata[k]

            text_content = query_config.chunk_template.format(index=i + 1, chunk=chunk, metadata=metadata_for_context)
            picked.append(TextContentItem(text=text_content))

        picked.append(TextContentItem(text="END of knowledge_search tool results.\n"))
        picked.append(
            TextContentItem(
                text=f'The above results were retrieved to help answer the user\'s query: "{interleaved_content_as_str(content)}". Use them as supporting information only in answering this query.\n',
            )
        )

        return RAGQueryResult(
            content=picked,
            metadata={
                "document_ids": [c.document_id for c in chunks[: len(picked)]],
                "chunks": [c.content for c in chunks[: len(picked)]],
                "scores": scores[: len(picked)],
                "vector_db_ids": [c.metadata["vector_db_id"] for c in chunks[: len(picked)]],
            },
        )

    async def list_runtime_tools(
        self, tool_group_id: str | None = None, mcp_endpoint: URL | None = None
    ) -> ListToolDefsResponse:
        # Parameters are not listed since these methods are not yet invoked automatically
        # by the LLM. The method is only implemented so things like /tools can list without
        # encountering fatals.
        return ListToolDefsResponse(
            data=[
                ToolDef(
                    name="insert_into_memory",
                    description="Insert documents into memory",
                ),
                ToolDef(
                    name="knowledge_search",
                    description="Search for information in a database.",
                    input_schema={
                        "type": "object",
                        "properties": {
                            "query": {
                                "type": "string",
                                "description": "The query to search for. Can be a natural language sentence or keywords.",
                            }
                        },
                        "required": ["query"],
                    },
                ),
            ]
        )

    async def invoke_tool(self, tool_name: str, kwargs: dict[str, Any]) -> ToolInvocationResult:
        vector_db_ids = kwargs.get("vector_db_ids", [])
        query_config = kwargs.get("query_config")
        if query_config:
            query_config = TypeAdapter(RAGQueryConfig).validate_python(query_config)
        else:
            query_config = RAGQueryConfig()

        query = kwargs["query"]
        result = await self.query(
            content=query,
            vector_db_ids=vector_db_ids,
            query_config=query_config,
        )

        return ToolInvocationResult(
            content=result.content or [],
            metadata={
                **(result.metadata or {}),
                "citation_files": getattr(result, "citation_files", None),
            },
        )
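The removed `insert()` method above is essentially two calls: upload the document as a file, then attach that file to a vector store with a static chunking strategy. Clients that previously relied on the RAG tool's insert can do the same directly against the OpenAI-compatible API. A minimal sketch, with placeholder base URL, vector store id, and file name; depending on the SDK version the vector-store methods may sit under `client.beta` instead:

```python
# Sketch of the ingestion flow that replaces MemoryToolRuntimeImpl.insert():
# upload the document as a file, then attach it to a vector store with a static
# chunking strategy (512-token chunks, 128-token overlap, matching the defaults
# of the removed code). All names and ids below are placeholders.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8321/v1/openai/v1", api_key="none")

with open("notes.md", "rb") as f:
    uploaded = client.files.create(file=f, purpose="assistants")

client.vector_stores.files.create(
    vector_store_id="vs_123",  # placeholder vector store id
    file_id=uploaded.id,
    chunking_strategy={
        "type": "static",
        "static": {
            "max_chunk_size_tokens": 512,
            "chunk_overlap_tokens": 128,
        },
    },
)
```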