feat: remove special handling of builtin::rag tool (#1015)

Summary: Lets the model decide which tool it needs to call to respond to a query. Test Plan: ``` LLAMA_STACK_CONFIG=fireworks pytest -s -v tests/client-sdk/ --safety-shield meta-llama/Llama-Guard-3-8B ``` Also evaluated on a small benchmark with 20 questions from HotpotQA. With this PR and some prompting, the performance is 77% recall compared to 50% currently. --- [//]: # (BEGIN SAPLING FOOTER) Stack created with [Sapling](https://sapling-scm.com). Best reviewed with [ReviewStack](https://reviewstack.dev/meta-llama/llama-stack/pull/1015). * #1268 * #1239 * __->__ #1015
2025-12-03 09:53:45 +00:00 · 2025-02-26 13:04:52 -08:00 · 2025-02-26 13:04:52 -08:00 · bb2690f176
commit bb2690f176
parent c64f0d5888
4 changed files with 94 additions and 133 deletions
--- a/llama_stack/providers/inline/agents/meta_reference/agent_instance.py
+++ b/llama_stack/providers/inline/agents/meta_reference/agent_instance.py
@ -17,7 +17,6 @@ from typing import Any, AsyncGenerator, Dict, List, Optional, Tuple
 from urllib.parse import urlparse

 import httpx
-from pydantic import TypeAdapter

 from llama_stack.apis.agents import (
    AgentConfig,
@ -62,7 +61,7 @@ from llama_stack.apis.inference import (
    UserMessage,
 )
 from llama_stack.apis.safety import Safety
-from llama_stack.apis.tools import RAGDocument, RAGQueryConfig, ToolGroups, ToolInvocationResult, ToolRuntime
+from llama_stack.apis.tools import RAGDocument, ToolGroups, ToolInvocationResult, ToolRuntime
 from llama_stack.apis.vector_io import VectorIO
 from llama_stack.models.llama.datatypes import (
    BuiltinTool,
@ -70,7 +69,6 @@ from llama_stack.models.llama.datatypes import (
    ToolParamDefinition,
 )
 from llama_stack.providers.utils.kvstore import KVStore
-from llama_stack.providers.utils.memory.vector_store import concat_interleaved_content
 from llama_stack.providers.utils.telemetry import tracing

 from .persistence import AgentPersistence
@ -84,7 +82,7 @@ def make_random_string(length: int = 8):


 TOOLS_ATTACHMENT_KEY_REGEX = re.compile(r"__tools_attachment__=(\{.*?\})")
-MEMORY_QUERY_TOOL = "query_from_memory"
+MEMORY_QUERY_TOOL = "knowledge_search"
 WEB_SEARCH_TOOL = "web_search"
 RAG_TOOL_GROUP = "builtin::rag"

@ -517,93 +515,6 @@ class ChatAgent(ShieldRunnerMixin):
        if documents:
            await self.handle_documents(session_id, documents, input_messages, tool_defs)

-        if RAG_TOOL_GROUP in toolgroups and len(input_messages) > 0:
-            with tracing.span(MEMORY_QUERY_TOOL) as span:
-                step_id = str(uuid.uuid4())
-                yield AgentTurnResponseStreamChunk(
-                    event=AgentTurnResponseEvent(
-                        payload=AgentTurnResponseStepStartPayload(
-                            step_type=StepType.tool_execution.value,
-                            step_id=step_id,
-                        )
-                    )
-                )
-
-                args = toolgroup_args.get(RAG_TOOL_GROUP, {})
-                vector_db_ids = args.get("vector_db_ids", [])
-                query_config = args.get("query_config")
-                if query_config:
-                    query_config = TypeAdapter(RAGQueryConfig).validate_python(query_config)
-                else:
-                    # handle someone passing an empty dict
-                    query_config = RAGQueryConfig()
-
-                session_info = await self.storage.get_session_info(session_id)
-
-                # if the session has a memory bank id, let the memory tool use it
-                if session_info.vector_db_id:
-                    vector_db_ids.append(session_info.vector_db_id)
-
-                yield AgentTurnResponseStreamChunk(
-                    event=AgentTurnResponseEvent(
-                        payload=AgentTurnResponseStepProgressPayload(
-                            step_type=StepType.tool_execution.value,
-                            step_id=step_id,
-                            delta=ToolCallDelta(
-                                parse_status=ToolCallParseStatus.succeeded,
-                                tool_call=ToolCall(
-                                    call_id="",
-                                    tool_name=MEMORY_QUERY_TOOL,
-                                    arguments={},
-                                ),
-                            ),
-                        )
-                    )
-                )
-                result = await self.tool_runtime_api.rag_tool.query(
-                    content=concat_interleaved_content([msg.content for msg in input_messages]),
-                    vector_db_ids=vector_db_ids,
-                    query_config=query_config,
-                )
-                retrieved_context = result.content
-
-                yield AgentTurnResponseStreamChunk(
-                    event=AgentTurnResponseEvent(
-                        payload=AgentTurnResponseStepCompletePayload(
-                            step_type=StepType.tool_execution.value,
-                            step_id=step_id,
-                            step_details=ToolExecutionStep(
-                                step_id=step_id,
-                                turn_id=turn_id,
-                                tool_calls=[
-                                    ToolCall(
-                                        call_id="",
-                                        tool_name=MEMORY_QUERY_TOOL,
-                                        arguments={},
-                                    )
-                                ],
-                                tool_responses=[
-                                    ToolResponse(
-                                        call_id="",
-                                        tool_name=MEMORY_QUERY_TOOL,
-                                        content=retrieved_context or [],
-                                        metadata=result.metadata,
-                                    )
-                                ],
-                            ),
-                        )
-                    )
-                )
-                span.set_attribute("input", [m.model_dump_json() for m in input_messages])
-                span.set_attribute("output", retrieved_context)
-                span.set_attribute("tool_name", MEMORY_QUERY_TOOL)
-
-                # append retrieved_context to the last user message
-                for message in input_messages[::-1]:
-                    if isinstance(message, UserMessage):
-                        message.context = retrieved_context
-                        break
-
        output_attachments = []

        n_iter = 0
@ -631,9 +542,7 @@ class ChatAgent(ShieldRunnerMixin):
                async for chunk in await self.inference_api.chat_completion(
                    self.agent_config.model,
                    input_messages,
-                    tools=[
-                        tool for tool in tool_defs.values() if tool_to_group.get(tool.tool_name, None) != RAG_TOOL_GROUP
-                    ],
+                    tools=[tool for tool in tool_defs.values()],
                    tool_prompt_format=self.agent_config.tool_config.tool_prompt_format,
                    response_format=self.agent_config.response_format,
                    stream=True,
@ -845,8 +754,9 @@ class ChatAgent(ShieldRunnerMixin):

                # TODO: add tool-input touchpoint and a "start" event for this step also
                # but that needs a lot more refactoring of Tool code potentially
-
-                if out_attachment := _interpret_content_as_attachment(result_message.content):
+                if (type(result_message.content) is str) and (
+                    out_attachment := _interpret_content_as_attachment(result_message.content)
+                ):
                    # NOTE: when we push this message back to the model, the model may ignore the
                    # attached file path etc. since the model is trained to only provide a user message
                    # with the summary. We keep all generated attachments and then attach them to final message
@ -1072,7 +982,11 @@ async def attachment_message(tempdir: str, urls: List[URL]) -> ToolResponseMessa
        else:
            raise ValueError(f"Unsupported URL {url}")

-        content.append(TextContentItem(text=f'# There is a file accessible to you at "{filepath}"\n'))
+        content.append(
+            TextContentItem(
+                text=f'# User provided a file accessible to you at "{filepath}"\nYou can use code_interpreter to load and inspect it.'
+            )
+        )

    return ToolResponseMessage(
        call_id="",