From 0e96279beee6627e9447aaa8d30a169403046e84 Mon Sep 17 00:00:00 2001
From: Ashwin Bharambe <ashwin.bharambe@gmail.com>
Date: Mon, 20 Oct 2025 22:26:21 -0700
Subject: [PATCH] chore(cleanup)!: remove tool_runtime.rag_tool (#3871)

Kill the `builtin::rag` tool group completely since it is no longer
targeted. We use the Responses implementation for knowledge_search which
uses the `openai_vector_stores` pathway.

---------

Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>
---
 client-sdks/stainless/openapi.yml             | 331 --------------
 .../self_hosted_distro/meta-reference-gpu.md  |   2 +-
 .../self_hosted_distro/nvidia.md              |   2 +-
 .../providers/vector_io/inline_sqlite-vec.mdx |   4 +-
 .../openapi_generator/pyopenapi/operations.py |   6 -
 docs/static/llama-stack-spec.html             | 423 ------------------
 docs/static/llama-stack-spec.yaml             | 331 --------------
 docs/static/stainless-llama-stack-spec.html   | 423 ------------------
 docs/static/stainless-llama-stack-spec.yaml   | 331 --------------
 llama_stack/apis/tools/__init__.py            |   1 -
 llama_stack/apis/tools/rag_tool.py            | 218 ---------
 llama_stack/apis/tools/tools.py               |  14 -
 llama_stack/core/routers/tool_runtime.py      |  45 +-
 llama_stack/core/server/routes.py             |  18 -
 llama_stack/core/stack.py                     |   3 +-
 llama_stack/distributions/ci-tests/build.yaml |   1 -
 llama_stack/distributions/ci-tests/run.yaml   |   4 -
 llama_stack/distributions/dell/build.yaml     |   1 -
 llama_stack/distributions/dell/dell.py        |   5 -
 .../distributions/dell/run-with-safety.yaml   |   4 -
 llama_stack/distributions/dell/run.yaml       |   4 -
 .../meta-reference-gpu/build.yaml             |   1 -
 .../meta-reference-gpu/meta_reference.py      |   5 -
 .../meta-reference-gpu/run-with-safety.yaml   |   4 -
 .../distributions/meta-reference-gpu/run.yaml |   4 -
 llama_stack/distributions/nvidia/build.yaml   |   3 +-
 llama_stack/distributions/nvidia/nvidia.py    |   9 +-
 .../distributions/nvidia/run-with-safety.yaml |   8 +-
 llama_stack/distributions/nvidia/run.yaml     |   8 +-
 .../distributions/open-benchmark/build.yaml   |   1 -
 .../open-benchmark/open_benchmark.py          |   5 -
 .../distributions/open-benchmark/run.yaml     |   4 -
 .../distributions/postgres-demo/build.yaml    |   1 -
 .../postgres-demo/postgres_demo.py            |   5 -
 .../distributions/postgres-demo/run.yaml      |   4 -
 .../distributions/starter-gpu/build.yaml      |   1 -
 .../distributions/starter-gpu/run.yaml        |   4 -
 llama_stack/distributions/starter/build.yaml  |   1 -
 llama_stack/distributions/starter/run.yaml    |   4 -
 llama_stack/distributions/starter/starter.py  |   5 -
 llama_stack/distributions/watsonx/build.yaml  |   1 -
 llama_stack/distributions/watsonx/run.yaml    |   4 -
 llama_stack/distributions/watsonx/watsonx.py  |   5 -
 .../providers/inline/tool_runtime/__init__.py |   5 -
 .../inline/tool_runtime/rag/__init__.py       |  19 -
 .../inline/tool_runtime/rag/config.py         |  15 -
 .../tool_runtime/rag/context_retriever.py     |  77 ----
 .../inline/tool_runtime/rag/memory.py         | 332 --------------
 llama_stack/providers/registry/inference.py   |   1 +
 .../providers/registry/tool_runtime.py        |  20 -
 llama_stack/providers/registry/vector_io.py   |   2 +-
 .../providers/utils/memory/vector_store.py    |  28 --
 .../utils/memory/test_vector_store.py         | 169 +------
 tests/unit/rag/test_rag_query.py              | 138 ------
 tests/unit/rag/test_vector_store.py           |  67 ---
 55 files changed, 17 insertions(+), 3114 deletions(-)
 delete mode 100644 llama_stack/apis/tools/rag_tool.py
 delete mode 100644 llama_stack/providers/inline/tool_runtime/__init__.py
 delete mode 100644 llama_stack/providers/inline/tool_runtime/rag/__init__.py
 delete mode 100644 llama_stack/providers/inline/tool_runtime/rag/config.py
 delete mode 100644 llama_stack/providers/inline/tool_runtime/rag/context_retriever.py
 delete mode 100644 llama_stack/providers/inline/tool_runtime/rag/memory.py
 delete mode 100644 tests/unit/rag/test_rag_query.py

diff --git a/client-sdks/stainless/openapi.yml b/client-sdks/stainless/openapi.yml
index 93049a14a..98a309f12 100644
--- a/client-sdks/stainless/openapi.yml
+++ b/client-sdks/stainless/openapi.yml
@@ -2039,69 +2039,6 @@ paths:
           schema:
             $ref: '#/components/schemas/URL'
       deprecated: false
-  /v1/tool-runtime/rag-tool/insert:
-    post:
-      responses:
-        '200':
-          description: OK
-        '400':
-          $ref: '#/components/responses/BadRequest400'
-        '429':
-          $ref: >-
-            #/components/responses/TooManyRequests429
-        '500':
-          $ref: >-
-            #/components/responses/InternalServerError500
-        default:
-          $ref: '#/components/responses/DefaultError'
-      tags:
-        - ToolRuntime
-      summary: >-
-        Index documents so they can be used by the RAG system.
-      description: >-
-        Index documents so they can be used by the RAG system.
-      parameters: []
-      requestBody:
-        content:
-          application/json:
-            schema:
-              $ref: '#/components/schemas/InsertRequest'
-        required: true
-      deprecated: false
-  /v1/tool-runtime/rag-tool/query:
-    post:
-      responses:
-        '200':
-          description: >-
-            RAGQueryResult containing the retrieved content and metadata
-          content:
-            application/json:
-              schema:
-                $ref: '#/components/schemas/RAGQueryResult'
-        '400':
-          $ref: '#/components/responses/BadRequest400'
-        '429':
-          $ref: >-
-            #/components/responses/TooManyRequests429
-        '500':
-          $ref: >-
-            #/components/responses/InternalServerError500
-        default:
-          $ref: '#/components/responses/DefaultError'
-      tags:
-        - ToolRuntime
-      summary: >-
-        Query the RAG system for context; typically invoked by the agent.
-      description: >-
-        Query the RAG system for context; typically invoked by the agent.
-      parameters: []
-      requestBody:
-        content:
-          application/json:
-            schema:
-              $ref: '#/components/schemas/QueryRequest'
-        required: true
-      deprecated: false
   /v1/toolgroups:
     get:
       responses:
@@ -9921,274 +9858,6 @@ components:
       title: ListToolDefsResponse
       description: >-
         Response containing a list of tool definitions.
-    RAGDocument:
-      type: object
-      properties:
-        document_id:
-          type: string
-          description: The unique identifier for the document.
-        content:
-          oneOf:
-            - type: string
-            - $ref: '#/components/schemas/InterleavedContentItem'
-            - type: array
-              items:
-                $ref: '#/components/schemas/InterleavedContentItem'
-            - $ref: '#/components/schemas/URL'
-          description: The content of the document.
-        mime_type:
-          type: string
-          description: The MIME type of the document.
-        metadata:
-          type: object
-          additionalProperties:
-            oneOf:
-              - type: 'null'
-              - type: boolean
-              - type: number
-              - type: string
-              - type: array
-              - type: object
-          description: Additional metadata for the document.
-      additionalProperties: false
-      required:
-        - document_id
-        - content
-        - metadata
-      title: RAGDocument
-      description: >-
-        A document to be used for document ingestion in the RAG Tool.
-    InsertRequest:
-      type: object
-      properties:
-        documents:
-          type: array
-          items:
-            $ref: '#/components/schemas/RAGDocument'
-          description: >-
-            List of documents to index in the RAG system
-        vector_db_id:
-          type: string
-          description: >-
-            ID of the vector database to store the document embeddings
-        chunk_size_in_tokens:
-          type: integer
-          description: >-
-            (Optional) Size in tokens for document chunking during indexing
-      additionalProperties: false
-      required:
-        - documents
-        - vector_db_id
-        - chunk_size_in_tokens
-      title: InsertRequest
-    DefaultRAGQueryGeneratorConfig:
-      type: object
-      properties:
-        type:
-          type: string
-          const: default
-          default: default
-          description: >-
-            Type of query generator, always 'default'
-        separator:
-          type: string
-          default: ' '
-          description: >-
-            String separator used to join query terms
-      additionalProperties: false
-      required:
-        - type
-        - separator
-      title: DefaultRAGQueryGeneratorConfig
-      description: >-
-        Configuration for the default RAG query generator.
-    LLMRAGQueryGeneratorConfig:
-      type: object
-      properties:
-        type:
-          type: string
-          const: llm
-          default: llm
-          description: Type of query generator, always 'llm'
-        model:
-          type: string
-          description: >-
-            Name of the language model to use for query generation
-        template:
-          type: string
-          description: >-
-            Template string for formatting the query generation prompt
-      additionalProperties: false
-      required:
-        - type
-        - model
-        - template
-      title: LLMRAGQueryGeneratorConfig
-      description: >-
-        Configuration for the LLM-based RAG query generator.
-    RAGQueryConfig:
-      type: object
-      properties:
-        query_generator_config:
-          oneOf:
-            - $ref: '#/components/schemas/DefaultRAGQueryGeneratorConfig'
-            - $ref: '#/components/schemas/LLMRAGQueryGeneratorConfig'
-          discriminator:
-            propertyName: type
-            mapping:
-              default: '#/components/schemas/DefaultRAGQueryGeneratorConfig'
-              llm: '#/components/schemas/LLMRAGQueryGeneratorConfig'
-          description: Configuration for the query generator.
-        max_tokens_in_context:
-          type: integer
-          default: 4096
-          description: Maximum number of tokens in the context.
-        max_chunks:
-          type: integer
-          default: 5
-          description: Maximum number of chunks to retrieve.
-        chunk_template:
-          type: string
-          default: >
-            Result {index}
-
-            Content: {chunk.content}
-
-            Metadata: {metadata}
-          description: >-
-            Template for formatting each retrieved chunk in the context. Available
-            placeholders: {index} (1-based chunk ordinal), {chunk.content} (chunk
-            content string), {metadata} (chunk metadata dict). Default: "Result {index}\nContent:
-            {chunk.content}\nMetadata: {metadata}\n"
-        mode:
-          $ref: '#/components/schemas/RAGSearchMode'
-          default: vector
-          description: >-
-            Search mode for retrieval—either "vector", "keyword", or "hybrid". Default
-            "vector".
-        ranker:
-          $ref: '#/components/schemas/Ranker'
-          description: >-
-            Configuration for the ranker to use in hybrid search. Defaults to RRF
-            ranker.
-      additionalProperties: false
-      required:
-        - query_generator_config
-        - max_tokens_in_context
-        - max_chunks
-        - chunk_template
-      title: RAGQueryConfig
-      description: >-
-        Configuration for the RAG query generation.
-    RAGSearchMode:
-      type: string
-      enum:
-        - vector
-        - keyword
-        - hybrid
-      title: RAGSearchMode
-      description: >-
-        Search modes for RAG query retrieval: - VECTOR: Uses vector similarity search
-        for semantic matching - KEYWORD: Uses keyword-based search for exact matching
-        - HYBRID: Combines both vector and keyword search for better results
-    RRFRanker:
-      type: object
-      properties:
-        type:
-          type: string
-          const: rrf
-          default: rrf
-          description: The type of ranker, always "rrf"
-        impact_factor:
-          type: number
-          default: 60.0
-          description: >-
-            The impact factor for RRF scoring. Higher values give more weight to higher-ranked
-            results. Must be greater than 0
-      additionalProperties: false
-      required:
-        - type
-        - impact_factor
-      title: RRFRanker
-      description: >-
-        Reciprocal Rank Fusion (RRF) ranker configuration.
-    Ranker:
-      oneOf:
-        - $ref: '#/components/schemas/RRFRanker'
-        - $ref: '#/components/schemas/WeightedRanker'
-      discriminator:
-        propertyName: type
-        mapping:
-          rrf: '#/components/schemas/RRFRanker'
-          weighted: '#/components/schemas/WeightedRanker'
-    WeightedRanker:
-      type: object
-      properties:
-        type:
-          type: string
-          const: weighted
-          default: weighted
-          description: The type of ranker, always "weighted"
-        alpha:
-          type: number
-          default: 0.5
-          description: >-
-            Weight factor between 0 and 1. 0 means only use keyword scores, 1 means
-            only use vector scores, values in between blend both scores.
-      additionalProperties: false
-      required:
-        - type
-        - alpha
-      title: WeightedRanker
-      description: >-
-        Weighted ranker configuration that combines vector and keyword scores.
-    QueryRequest:
-      type: object
-      properties:
-        content:
-          $ref: '#/components/schemas/InterleavedContent'
-          description: >-
-            The query content to search for in the indexed documents
-        vector_db_ids:
-          type: array
-          items:
-            type: string
-          description: >-
-            List of vector database IDs to search within
-        query_config:
-          $ref: '#/components/schemas/RAGQueryConfig'
-          description: >-
-            (Optional) Configuration parameters for the query operation
-      additionalProperties: false
-      required:
-        - content
-        - vector_db_ids
-      title: QueryRequest
-    RAGQueryResult:
-      type: object
-      properties:
-        content:
-          $ref: '#/components/schemas/InterleavedContent'
-          description: >-
-            (Optional) The retrieved content from the query
-        metadata:
-          type: object
-          additionalProperties:
-            oneOf:
-              - type: 'null'
-              - type: boolean
-              - type: number
-              - type: string
-              - type: array
-              - type: object
-          description: >-
-            Additional metadata about the query result
-      additionalProperties: false
-      required:
-        - metadata
-      title: RAGQueryResult
-      description: >-
-        Result of a RAG query containing retrieved content and metadata.
     ToolGroup:
       type: object
       properties:
diff --git a/docs/docs/distributions/self_hosted_distro/meta-reference-gpu.md b/docs/docs/distributions/self_hosted_distro/meta-reference-gpu.md
index b7134b3e1..666850976 100644
--- a/docs/docs/distributions/self_hosted_distro/meta-reference-gpu.md
+++ b/docs/docs/distributions/self_hosted_distro/meta-reference-gpu.md
@@ -21,7 +21,7 @@ The `llamastack/distribution-meta-reference-gpu` distribution consists of the fo
 | inference | `inline::meta-reference` |
 | safety | `inline::llama-guard` |
 | scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` |
-| tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::rag-runtime`, `remote::model-context-protocol` |
+| tool_runtime | `remote::brave-search`, `remote::tavily-search`, `remote::model-context-protocol` |
 | vector_io | `inline::faiss`, `remote::chromadb`, `remote::pgvector` |
 
 
diff --git a/docs/docs/distributions/self_hosted_distro/nvidia.md b/docs/docs/distributions/self_hosted_distro/nvidia.md
index 4a7d99ff5..b1de9ddb8 100644
--- a/docs/docs/distributions/self_hosted_distro/nvidia.md
+++ b/docs/docs/distributions/self_hosted_distro/nvidia.md
@@ -16,7 +16,7 @@ The `llamastack/distribution-nvidia` distribution consists of the following prov
 | post_training | `remote::nvidia` |
 | safety | `remote::nvidia` |
 | scoring | `inline::basic` |
-| tool_runtime | `inline::rag-runtime` |
+| tool_runtime |  |
 | vector_io | `inline::faiss` |
 
 
diff --git a/docs/docs/providers/vector_io/inline_sqlite-vec.mdx b/docs/docs/providers/vector_io/inline_sqlite-vec.mdx
index 98a372250..459498a59 100644
--- a/docs/docs/providers/vector_io/inline_sqlite-vec.mdx
+++ b/docs/docs/providers/vector_io/inline_sqlite-vec.mdx
@@ -28,7 +28,7 @@ description: |
   #### Empirical Example
 
   Consider the histogram below in which 10,000 randomly generated strings were inserted
-  in batches of 100 into both Faiss and sqlite-vec using `client.tool_runtime.rag_tool.insert()`.
+  in batches of 100 into both Faiss and sqlite-vec.
 
   ```{image} ../../../../_static/providers/vector_io/write_time_comparison_sqlite-vec-faiss.png
   :alt: Comparison of SQLite-Vec and Faiss write times
@@ -233,7 +233,7 @@ Datasets that can fit in memory, frequent reads | Faiss | Optimized for speed, i
 #### Empirical Example
 
 Consider the histogram below in which 10,000 randomly generated strings were inserted
-in batches of 100 into both Faiss and sqlite-vec using `client.tool_runtime.rag_tool.insert()`.
+in batches of 100 into both Faiss and sqlite-vec.
 
 ```{image} ../../../../_static/providers/vector_io/write_time_comparison_sqlite-vec-faiss.png
 :alt: Comparison of SQLite-Vec and Faiss write times
diff --git a/docs/openapi_generator/pyopenapi/operations.py b/docs/openapi_generator/pyopenapi/operations.py
index 2970d7e53..e5f33f13d 100644
--- a/docs/openapi_generator/pyopenapi/operations.py
+++ b/docs/openapi_generator/pyopenapi/operations.py
@@ -196,16 +196,10 @@ def _get_endpoint_functions(
 def _get_defining_class(member_fn: str, derived_cls: type) -> type:
     "Find the class in which a member function is first defined in a class inheritance hierarchy."
 
-    # This import must be dynamic here
-    from llama_stack.apis.tools import RAGToolRuntime, ToolRuntime
-
     # iterate in reverse member resolution order to find most specific class first
     for cls in reversed(inspect.getmro(derived_cls)):
         for name, _ in inspect.getmembers(cls, inspect.isfunction):
             if name == member_fn:
-                # HACK ALERT
-                if cls == RAGToolRuntime:
-                    return ToolRuntime
                 return cls
 
     raise ValidationError(
diff --git a/docs/static/llama-stack-spec.html b/docs/static/llama-stack-spec.html
index 61deaec1e..7dfb2ed13 100644
--- a/docs/static/llama-stack-spec.html
+++ b/docs/static/llama-stack-spec.html
@@ -2624,89 +2624,6 @@
                 "deprecated": false
             }
         },
-        "/v1/tool-runtime/rag-tool/insert": {
-            "post": {
-                "responses": {
-                    "200": {
-                        "description": "OK"
-                    },
-                    "400": {
-                        "$ref": "#/components/responses/BadRequest400"
-                    },
-                    "429": {
-                        "$ref": "#/components/responses/TooManyRequests429"
-                    },
-                    "500": {
-                        "$ref": "#/components/responses/InternalServerError500"
-                    },
-                    "default": {
-                        "$ref": "#/components/responses/DefaultError"
-                    }
-                },
-                "tags": [
-                    "ToolRuntime"
-                ],
-                "summary": "Index documents so they can be used by the RAG system.",
-                "description": "Index documents so they can be used by the RAG system.",
-                "parameters": [],
-                "requestBody": {
-                    "content": {
-                        "application/json": {
-                            "schema": {
-                                "$ref": "#/components/schemas/InsertRequest"
-                            }
-                        }
-                    },
-                    "required": true
-                },
-                "deprecated": false
-            }
-        },
-        "/v1/tool-runtime/rag-tool/query": {
-            "post": {
-                "responses": {
-                    "200": {
-                        "description": "RAGQueryResult containing the retrieved content and metadata",
-                        "content": {
-                            "application/json": {
-                                "schema": {
-                                    "$ref": "#/components/schemas/RAGQueryResult"
-                                }
-                            }
-                        }
-                    },
-                    "400": {
-                        "$ref": "#/components/responses/BadRequest400"
-                    },
-                    "429": {
-                        "$ref": "#/components/responses/TooManyRequests429"
-                    },
-                    "500": {
-                        "$ref": "#/components/responses/InternalServerError500"
-                    },
-                    "default": {
-                        "$ref": "#/components/responses/DefaultError"
-                    }
-                },
-                "tags": [
-                    "ToolRuntime"
-                ],
-                "summary": "Query the RAG system for context; typically invoked by the agent.",
-                "description": "Query the RAG system for context; typically invoked by the agent.",
-                "parameters": [],
-                "requestBody": {
-                    "content": {
-                        "application/json": {
-                            "schema": {
-                                "$ref": "#/components/schemas/QueryRequest"
-                            }
-                        }
-                    },
-                    "required": true
-                },
-                "deprecated": false
-            }
-        },
         "/v1/toolgroups": {
             "get": {
                 "responses": {
@@ -11383,346 +11300,6 @@
                 "title": "ListToolDefsResponse",
                 "description": "Response containing a list of tool definitions."
             },
-            "RAGDocument": {
-                "type": "object",
-                "properties": {
-                    "document_id": {
-                        "type": "string",
-                        "description": "The unique identifier for the document."
-                    },
-                    "content": {
-                        "oneOf": [
-                            {
-                                "type": "string"
-                            },
-                            {
-                                "$ref": "#/components/schemas/InterleavedContentItem"
-                            },
-                            {
-                                "type": "array",
-                                "items": {
-                                    "$ref": "#/components/schemas/InterleavedContentItem"
-                                }
-                            },
-                            {
-                                "$ref": "#/components/schemas/URL"
-                            }
-                        ],
-                        "description": "The content of the document."
-                    },
-                    "mime_type": {
-                        "type": "string",
-                        "description": "The MIME type of the document."
-                    },
-                    "metadata": {
-                        "type": "object",
-                        "additionalProperties": {
-                            "oneOf": [
-                                {
-                                    "type": "null"
-                                },
-                                {
-                                    "type": "boolean"
-                                },
-                                {
-                                    "type": "number"
-                                },
-                                {
-                                    "type": "string"
-                                },
-                                {
-                                    "type": "array"
-                                },
-                                {
-                                    "type": "object"
-                                }
-                            ]
-                        },
-                        "description": "Additional metadata for the document."
-                    }
-                },
-                "additionalProperties": false,
-                "required": [
-                    "document_id",
-                    "content",
-                    "metadata"
-                ],
-                "title": "RAGDocument",
-                "description": "A document to be used for document ingestion in the RAG Tool."
-            },
-            "InsertRequest": {
-                "type": "object",
-                "properties": {
-                    "documents": {
-                        "type": "array",
-                        "items": {
-                            "$ref": "#/components/schemas/RAGDocument"
-                        },
-                        "description": "List of documents to index in the RAG system"
-                    },
-                    "vector_db_id": {
-                        "type": "string",
-                        "description": "ID of the vector database to store the document embeddings"
-                    },
-                    "chunk_size_in_tokens": {
-                        "type": "integer",
-                        "description": "(Optional) Size in tokens for document chunking during indexing"
-                    }
-                },
-                "additionalProperties": false,
-                "required": [
-                    "documents",
-                    "vector_db_id",
-                    "chunk_size_in_tokens"
-                ],
-                "title": "InsertRequest"
-            },
-            "DefaultRAGQueryGeneratorConfig": {
-                "type": "object",
-                "properties": {
-                    "type": {
-                        "type": "string",
-                        "const": "default",
-                        "default": "default",
-                        "description": "Type of query generator, always 'default'"
-                    },
-                    "separator": {
-                        "type": "string",
-                        "default": " ",
-                        "description": "String separator used to join query terms"
-                    }
-                },
-                "additionalProperties": false,
-                "required": [
-                    "type",
-                    "separator"
-                ],
-                "title": "DefaultRAGQueryGeneratorConfig",
-                "description": "Configuration for the default RAG query generator."
-            },
-            "LLMRAGQueryGeneratorConfig": {
-                "type": "object",
-                "properties": {
-                    "type": {
-                        "type": "string",
-                        "const": "llm",
-                        "default": "llm",
-                        "description": "Type of query generator, always 'llm'"
-                    },
-                    "model": {
-                        "type": "string",
-                        "description": "Name of the language model to use for query generation"
-                    },
-                    "template": {
-                        "type": "string",
-                        "description": "Template string for formatting the query generation prompt"
-                    }
-                },
-                "additionalProperties": false,
-                "required": [
-                    "type",
-                    "model",
-                    "template"
-                ],
-                "title": "LLMRAGQueryGeneratorConfig",
-                "description": "Configuration for the LLM-based RAG query generator."
-            },
-            "RAGQueryConfig": {
-                "type": "object",
-                "properties": {
-                    "query_generator_config": {
-                        "oneOf": [
-                            {
-                                "$ref": "#/components/schemas/DefaultRAGQueryGeneratorConfig"
-                            },
-                            {
-                                "$ref": "#/components/schemas/LLMRAGQueryGeneratorConfig"
-                            }
-                        ],
-                        "discriminator": {
-                            "propertyName": "type",
-                            "mapping": {
-                                "default": "#/components/schemas/DefaultRAGQueryGeneratorConfig",
-                                "llm": "#/components/schemas/LLMRAGQueryGeneratorConfig"
-                            }
-                        },
-                        "description": "Configuration for the query generator."
-                    },
-                    "max_tokens_in_context": {
-                        "type": "integer",
-                        "default": 4096,
-                        "description": "Maximum number of tokens in the context."
-                    },
-                    "max_chunks": {
-                        "type": "integer",
-                        "default": 5,
-                        "description": "Maximum number of chunks to retrieve."
-                    },
-                    "chunk_template": {
-                        "type": "string",
-                        "default": "Result {index}\nContent: {chunk.content}\nMetadata: {metadata}\n",
-                        "description": "Template for formatting each retrieved chunk in the context. Available placeholders: {index} (1-based chunk ordinal), {chunk.content} (chunk content string), {metadata} (chunk metadata dict). Default: \"Result {index}\\nContent: {chunk.content}\\nMetadata: {metadata}\\n\""
-                    },
-                    "mode": {
-                        "$ref": "#/components/schemas/RAGSearchMode",
-                        "default": "vector",
-                        "description": "Search mode for retrieval—either \"vector\", \"keyword\", or \"hybrid\". Default \"vector\"."
-                    },
-                    "ranker": {
-                        "$ref": "#/components/schemas/Ranker",
-                        "description": "Configuration for the ranker to use in hybrid search. Defaults to RRF ranker."
-                    }
-                },
-                "additionalProperties": false,
-                "required": [
-                    "query_generator_config",
-                    "max_tokens_in_context",
-                    "max_chunks",
-                    "chunk_template"
-                ],
-                "title": "RAGQueryConfig",
-                "description": "Configuration for the RAG query generation."
-            },
-            "RAGSearchMode": {
-                "type": "string",
-                "enum": [
-                    "vector",
-                    "keyword",
-                    "hybrid"
-                ],
-                "title": "RAGSearchMode",
-                "description": "Search modes for RAG query retrieval: - VECTOR: Uses vector similarity search for semantic matching - KEYWORD: Uses keyword-based search for exact matching - HYBRID: Combines both vector and keyword search for better results"
-            },
-            "RRFRanker": {
-                "type": "object",
-                "properties": {
-                    "type": {
-                        "type": "string",
-                        "const": "rrf",
-                        "default": "rrf",
-                        "description": "The type of ranker, always \"rrf\""
-                    },
-                    "impact_factor": {
-                        "type": "number",
-                        "default": 60.0,
-                        "description": "The impact factor for RRF scoring. Higher values give more weight to higher-ranked results. Must be greater than 0"
-                    }
-                },
-                "additionalProperties": false,
-                "required": [
-                    "type",
-                    "impact_factor"
-                ],
-                "title": "RRFRanker",
-                "description": "Reciprocal Rank Fusion (RRF) ranker configuration."
-            },
-            "Ranker": {
-                "oneOf": [
-                    {
-                        "$ref": "#/components/schemas/RRFRanker"
-                    },
-                    {
-                        "$ref": "#/components/schemas/WeightedRanker"
-                    }
-                ],
-                "discriminator": {
-                    "propertyName": "type",
-                    "mapping": {
-                        "rrf": "#/components/schemas/RRFRanker",
-                        "weighted": "#/components/schemas/WeightedRanker"
-                    }
-                }
-            },
-            "WeightedRanker": {
-                "type": "object",
-                "properties": {
-                    "type": {
-                        "type": "string",
-                        "const": "weighted",
-                        "default": "weighted",
-                        "description": "The type of ranker, always \"weighted\""
-                    },
-                    "alpha": {
-                        "type": "number",
-                        "default": 0.5,
-                        "description": "Weight factor between 0 and 1. 0 means only use keyword scores, 1 means only use vector scores, values in between blend both scores."
-                    }
-                },
-                "additionalProperties": false,
-                "required": [
-                    "type",
-                    "alpha"
-                ],
-                "title": "WeightedRanker",
-                "description": "Weighted ranker configuration that combines vector and keyword scores."
-            },
-            "QueryRequest": {
-                "type": "object",
-                "properties": {
-                    "content": {
-                        "$ref": "#/components/schemas/InterleavedContent",
-                        "description": "The query content to search for in the indexed documents"
-                    },
-                    "vector_db_ids": {
-                        "type": "array",
-                        "items": {
-                            "type": "string"
-                        },
-                        "description": "List of vector database IDs to search within"
-                    },
-                    "query_config": {
-                        "$ref": "#/components/schemas/RAGQueryConfig",
-                        "description": "(Optional) Configuration parameters for the query operation"
-                    }
-                },
-                "additionalProperties": false,
-                "required": [
-                    "content",
-                    "vector_db_ids"
-                ],
-                "title": "QueryRequest"
-            },
-            "RAGQueryResult": {
-                "type": "object",
-                "properties": {
-                    "content": {
-                        "$ref": "#/components/schemas/InterleavedContent",
-                        "description": "(Optional) The retrieved content from the query"
-                    },
-                    "metadata": {
-                        "type": "object",
-                        "additionalProperties": {
-                            "oneOf": [
-                                {
-                                    "type": "null"
-                                },
-                                {
-                                    "type": "boolean"
-                                },
-                                {
-                                    "type": "number"
-                                },
-                                {
-                                    "type": "string"
-                                },
-                                {
-                                    "type": "array"
-                                },
-                                {
-                                    "type": "object"
-                                }
-                            ]
-                        },
-                        "description": "Additional metadata about the query result"
-                    }
-                },
-                "additionalProperties": false,
-                "required": [
-                    "metadata"
-                ],
-                "title": "RAGQueryResult",
-                "description": "Result of a RAG query containing retrieved content and metadata."
-            },
             "ToolGroup": {
                 "type": "object",
                 "properties": {
diff --git a/docs/static/llama-stack-spec.yaml b/docs/static/llama-stack-spec.yaml
index c6197b36f..1b0fefe55 100644
--- a/docs/static/llama-stack-spec.yaml
+++ b/docs/static/llama-stack-spec.yaml
@@ -2036,69 +2036,6 @@ paths:
           schema:
             $ref: '#/components/schemas/URL'
       deprecated: false
-  /v1/tool-runtime/rag-tool/insert:
-    post:
-      responses:
-        '200':
-          description: OK
-        '400':
-          $ref: '#/components/responses/BadRequest400'
-        '429':
-          $ref: >-
-            #/components/responses/TooManyRequests429
-        '500':
-          $ref: >-
-            #/components/responses/InternalServerError500
-        default:
-          $ref: '#/components/responses/DefaultError'
-      tags:
-        - ToolRuntime
-      summary: >-
-        Index documents so they can be used by the RAG system.
-      description: >-
-        Index documents so they can be used by the RAG system.
-      parameters: []
-      requestBody:
-        content:
-          application/json:
-            schema:
-              $ref: '#/components/schemas/InsertRequest'
-        required: true
-      deprecated: false
-  /v1/tool-runtime/rag-tool/query:
-    post:
-      responses:
-        '200':
-          description: >-
-            RAGQueryResult containing the retrieved content and metadata
-          content:
-            application/json:
-              schema:
-                $ref: '#/components/schemas/RAGQueryResult'
-        '400':
-          $ref: '#/components/responses/BadRequest400'
-        '429':
-          $ref: >-
-            #/components/responses/TooManyRequests429
-        '500':
-          $ref: >-
-            #/components/responses/InternalServerError500
-        default:
-          $ref: '#/components/responses/DefaultError'
-      tags:
-        - ToolRuntime
-      summary: >-
-        Query the RAG system for context; typically invoked by the agent.
-      description: >-
-        Query the RAG system for context; typically invoked by the agent.
-      parameters: []
-      requestBody:
-        content:
-          application/json:
-            schema:
-              $ref: '#/components/schemas/QueryRequest'
-        required: true
-      deprecated: false
   /v1/toolgroups:
     get:
       responses:
@@ -8708,274 +8645,6 @@ components:
       title: ListToolDefsResponse
       description: >-
         Response containing a list of tool definitions.
-    RAGDocument:
-      type: object
-      properties:
-        document_id:
-          type: string
-          description: The unique identifier for the document.
-        content:
-          oneOf:
-            - type: string
-            - $ref: '#/components/schemas/InterleavedContentItem'
-            - type: array
-              items:
-                $ref: '#/components/schemas/InterleavedContentItem'
-            - $ref: '#/components/schemas/URL'
-          description: The content of the document.
-        mime_type:
-          type: string
-          description: The MIME type of the document.
-        metadata:
-          type: object
-          additionalProperties:
-            oneOf:
-              - type: 'null'
-              - type: boolean
-              - type: number
-              - type: string
-              - type: array
-              - type: object
-          description: Additional metadata for the document.
-      additionalProperties: false
-      required:
-        - document_id
-        - content
-        - metadata
-      title: RAGDocument
-      description: >-
-        A document to be used for document ingestion in the RAG Tool.
-    InsertRequest:
-      type: object
-      properties:
-        documents:
-          type: array
-          items:
-            $ref: '#/components/schemas/RAGDocument'
-          description: >-
-            List of documents to index in the RAG system
-        vector_db_id:
-          type: string
-          description: >-
-            ID of the vector database to store the document embeddings
-        chunk_size_in_tokens:
-          type: integer
-          description: >-
-            (Optional) Size in tokens for document chunking during indexing
-      additionalProperties: false
-      required:
-        - documents
-        - vector_db_id
-        - chunk_size_in_tokens
-      title: InsertRequest
-    DefaultRAGQueryGeneratorConfig:
-      type: object
-      properties:
-        type:
-          type: string
-          const: default
-          default: default
-          description: >-
-            Type of query generator, always 'default'
-        separator:
-          type: string
-          default: ' '
-          description: >-
-            String separator used to join query terms
-      additionalProperties: false
-      required:
-        - type
-        - separator
-      title: DefaultRAGQueryGeneratorConfig
-      description: >-
-        Configuration for the default RAG query generator.
-    LLMRAGQueryGeneratorConfig:
-      type: object
-      properties:
-        type:
-          type: string
-          const: llm
-          default: llm
-          description: Type of query generator, always 'llm'
-        model:
-          type: string
-          description: >-
-            Name of the language model to use for query generation
-        template:
-          type: string
-          description: >-
-            Template string for formatting the query generation prompt
-      additionalProperties: false
-      required:
-        - type
-        - model
-        - template
-      title: LLMRAGQueryGeneratorConfig
-      description: >-
-        Configuration for the LLM-based RAG query generator.
-    RAGQueryConfig:
-      type: object
-      properties:
-        query_generator_config:
-          oneOf:
-            - $ref: '#/components/schemas/DefaultRAGQueryGeneratorConfig'
-            - $ref: '#/components/schemas/LLMRAGQueryGeneratorConfig'
-          discriminator:
-            propertyName: type
-            mapping:
-              default: '#/components/schemas/DefaultRAGQueryGeneratorConfig'
-              llm: '#/components/schemas/LLMRAGQueryGeneratorConfig'
-          description: Configuration for the query generator.
-        max_tokens_in_context:
-          type: integer
-          default: 4096
-          description: Maximum number of tokens in the context.
-        max_chunks:
-          type: integer
-          default: 5
-          description: Maximum number of chunks to retrieve.
-        chunk_template:
-          type: string
-          default: >
-            Result {index}
-
-            Content: {chunk.content}
-
-            Metadata: {metadata}
-          description: >-
-            Template for formatting each retrieved chunk in the context. Available
-            placeholders: {index} (1-based chunk ordinal), {chunk.content} (chunk
-            content string), {metadata} (chunk metadata dict). Default: "Result {index}\nContent:
-            {chunk.content}\nMetadata: {metadata}\n"
-        mode:
-          $ref: '#/components/schemas/RAGSearchMode'
-          default: vector
-          description: >-
-            Search mode for retrieval—either "vector", "keyword", or "hybrid". Default
-            "vector".
-        ranker:
-          $ref: '#/components/schemas/Ranker'
-          description: >-
-            Configuration for the ranker to use in hybrid search. Defaults to RRF
-            ranker.
-      additionalProperties: false
-      required:
-        - query_generator_config
-        - max_tokens_in_context
-        - max_chunks
-        - chunk_template
-      title: RAGQueryConfig
-      description: >-
-        Configuration for the RAG query generation.
-    RAGSearchMode:
-      type: string
-      enum:
-        - vector
-        - keyword
-        - hybrid
-      title: RAGSearchMode
-      description: >-
-        Search modes for RAG query retrieval: - VECTOR: Uses vector similarity search
-        for semantic matching - KEYWORD: Uses keyword-based search for exact matching
-        - HYBRID: Combines both vector and keyword search for better results
-    RRFRanker:
-      type: object
-      properties:
-        type:
-          type: string
-          const: rrf
-          default: rrf
-          description: The type of ranker, always "rrf"
-        impact_factor:
-          type: number
-          default: 60.0
-          description: >-
-            The impact factor for RRF scoring. Higher values give more weight to higher-ranked
-            results. Must be greater than 0
-      additionalProperties: false
-      required:
-        - type
-        - impact_factor
-      title: RRFRanker
-      description: >-
-        Reciprocal Rank Fusion (RRF) ranker configuration.
-    Ranker:
-      oneOf:
-        - $ref: '#/components/schemas/RRFRanker'
-        - $ref: '#/components/schemas/WeightedRanker'
-      discriminator:
-        propertyName: type
-        mapping:
-          rrf: '#/components/schemas/RRFRanker'
-          weighted: '#/components/schemas/WeightedRanker'
-    WeightedRanker:
-      type: object
-      properties:
-        type:
-          type: string
-          const: weighted
-          default: weighted
-          description: The type of ranker, always "weighted"
-        alpha:
-          type: number
-          default: 0.5
-          description: >-
-            Weight factor between 0 and 1. 0 means only use keyword scores, 1 means
-            only use vector scores, values in between blend both scores.
-      additionalProperties: false
-      required:
-        - type
-        - alpha
-      title: WeightedRanker
-      description: >-
-        Weighted ranker configuration that combines vector and keyword scores.
-    QueryRequest:
-      type: object
-      properties:
-        content:
-          $ref: '#/components/schemas/InterleavedContent'
-          description: >-
-            The query content to search for in the indexed documents
-        vector_db_ids:
-          type: array
-          items:
-            type: string
-          description: >-
-            List of vector database IDs to search within
-        query_config:
-          $ref: '#/components/schemas/RAGQueryConfig'
-          description: >-
-            (Optional) Configuration parameters for the query operation
-      additionalProperties: false
-      required:
-        - content
-        - vector_db_ids
-      title: QueryRequest
-    RAGQueryResult:
-      type: object
-      properties:
-        content:
-          $ref: '#/components/schemas/InterleavedContent'
-          description: >-
-            (Optional) The retrieved content from the query
-        metadata:
-          type: object
-          additionalProperties:
-            oneOf:
-              - type: 'null'
-              - type: boolean
-              - type: number
-              - type: string
-              - type: array
-              - type: object
-          description: >-
-            Additional metadata about the query result
-      additionalProperties: false
-      required:
-        - metadata
-      title: RAGQueryResult
-      description: >-
-        Result of a RAG query containing retrieved content and metadata.
     ToolGroup:
       type: object
       properties:
diff --git a/docs/static/stainless-llama-stack-spec.html b/docs/static/stainless-llama-stack-spec.html
index 38122ebc0..7930b28e6 100644
--- a/docs/static/stainless-llama-stack-spec.html
+++ b/docs/static/stainless-llama-stack-spec.html
@@ -2624,89 +2624,6 @@
                 "deprecated": false
             }
         },
-        "/v1/tool-runtime/rag-tool/insert": {
-            "post": {
-                "responses": {
-                    "200": {
-                        "description": "OK"
-                    },
-                    "400": {
-                        "$ref": "#/components/responses/BadRequest400"
-                    },
-                    "429": {
-                        "$ref": "#/components/responses/TooManyRequests429"
-                    },
-                    "500": {
-                        "$ref": "#/components/responses/InternalServerError500"
-                    },
-                    "default": {
-                        "$ref": "#/components/responses/DefaultError"
-                    }
-                },
-                "tags": [
-                    "ToolRuntime"
-                ],
-                "summary": "Index documents so they can be used by the RAG system.",
-                "description": "Index documents so they can be used by the RAG system.",
-                "parameters": [],
-                "requestBody": {
-                    "content": {
-                        "application/json": {
-                            "schema": {
-                                "$ref": "#/components/schemas/InsertRequest"
-                            }
-                        }
-                    },
-                    "required": true
-                },
-                "deprecated": false
-            }
-        },
-        "/v1/tool-runtime/rag-tool/query": {
-            "post": {
-                "responses": {
-                    "200": {
-                        "description": "RAGQueryResult containing the retrieved content and metadata",
-                        "content": {
-                            "application/json": {
-                                "schema": {
-                                    "$ref": "#/components/schemas/RAGQueryResult"
-                                }
-                            }
-                        }
-                    },
-                    "400": {
-                        "$ref": "#/components/responses/BadRequest400"
-                    },
-                    "429": {
-                        "$ref": "#/components/responses/TooManyRequests429"
-                    },
-                    "500": {
-                        "$ref": "#/components/responses/InternalServerError500"
-                    },
-                    "default": {
-                        "$ref": "#/components/responses/DefaultError"
-                    }
-                },
-                "tags": [
-                    "ToolRuntime"
-                ],
-                "summary": "Query the RAG system for context; typically invoked by the agent.",
-                "description": "Query the RAG system for context; typically invoked by the agent.",
-                "parameters": [],
-                "requestBody": {
-                    "content": {
-                        "application/json": {
-                            "schema": {
-                                "$ref": "#/components/schemas/QueryRequest"
-                            }
-                        }
-                    },
-                    "required": true
-                },
-                "deprecated": false
-            }
-        },
         "/v1/toolgroups": {
             "get": {
                 "responses": {
@@ -13055,346 +12972,6 @@
                 "title": "ListToolDefsResponse",
                 "description": "Response containing a list of tool definitions."
             },
-            "RAGDocument": {
-                "type": "object",
-                "properties": {
-                    "document_id": {
-                        "type": "string",
-                        "description": "The unique identifier for the document."
-                    },
-                    "content": {
-                        "oneOf": [
-                            {
-                                "type": "string"
-                            },
-                            {
-                                "$ref": "#/components/schemas/InterleavedContentItem"
-                            },
-                            {
-                                "type": "array",
-                                "items": {
-                                    "$ref": "#/components/schemas/InterleavedContentItem"
-                                }
-                            },
-                            {
-                                "$ref": "#/components/schemas/URL"
-                            }
-                        ],
-                        "description": "The content of the document."
-                    },
-                    "mime_type": {
-                        "type": "string",
-                        "description": "The MIME type of the document."
-                    },
-                    "metadata": {
-                        "type": "object",
-                        "additionalProperties": {
-                            "oneOf": [
-                                {
-                                    "type": "null"
-                                },
-                                {
-                                    "type": "boolean"
-                                },
-                                {
-                                    "type": "number"
-                                },
-                                {
-                                    "type": "string"
-                                },
-                                {
-                                    "type": "array"
-                                },
-                                {
-                                    "type": "object"
-                                }
-                            ]
-                        },
-                        "description": "Additional metadata for the document."
-                    }
-                },
-                "additionalProperties": false,
-                "required": [
-                    "document_id",
-                    "content",
-                    "metadata"
-                ],
-                "title": "RAGDocument",
-                "description": "A document to be used for document ingestion in the RAG Tool."
-            },
-            "InsertRequest": {
-                "type": "object",
-                "properties": {
-                    "documents": {
-                        "type": "array",
-                        "items": {
-                            "$ref": "#/components/schemas/RAGDocument"
-                        },
-                        "description": "List of documents to index in the RAG system"
-                    },
-                    "vector_db_id": {
-                        "type": "string",
-                        "description": "ID of the vector database to store the document embeddings"
-                    },
-                    "chunk_size_in_tokens": {
-                        "type": "integer",
-                        "description": "(Optional) Size in tokens for document chunking during indexing"
-                    }
-                },
-                "additionalProperties": false,
-                "required": [
-                    "documents",
-                    "vector_db_id",
-                    "chunk_size_in_tokens"
-                ],
-                "title": "InsertRequest"
-            },
-            "DefaultRAGQueryGeneratorConfig": {
-                "type": "object",
-                "properties": {
-                    "type": {
-                        "type": "string",
-                        "const": "default",
-                        "default": "default",
-                        "description": "Type of query generator, always 'default'"
-                    },
-                    "separator": {
-                        "type": "string",
-                        "default": " ",
-                        "description": "String separator used to join query terms"
-                    }
-                },
-                "additionalProperties": false,
-                "required": [
-                    "type",
-                    "separator"
-                ],
-                "title": "DefaultRAGQueryGeneratorConfig",
-                "description": "Configuration for the default RAG query generator."
-            },
-            "LLMRAGQueryGeneratorConfig": {
-                "type": "object",
-                "properties": {
-                    "type": {
-                        "type": "string",
-                        "const": "llm",
-                        "default": "llm",
-                        "description": "Type of query generator, always 'llm'"
-                    },
-                    "model": {
-                        "type": "string",
-                        "description": "Name of the language model to use for query generation"
-                    },
-                    "template": {
-                        "type": "string",
-                        "description": "Template string for formatting the query generation prompt"
-                    }
-                },
-                "additionalProperties": false,
-                "required": [
-                    "type",
-                    "model",
-                    "template"
-                ],
-                "title": "LLMRAGQueryGeneratorConfig",
-                "description": "Configuration for the LLM-based RAG query generator."
-            },
-            "RAGQueryConfig": {
-                "type": "object",
-                "properties": {
-                    "query_generator_config": {
-                        "oneOf": [
-                            {
-                                "$ref": "#/components/schemas/DefaultRAGQueryGeneratorConfig"
-                            },
-                            {
-                                "$ref": "#/components/schemas/LLMRAGQueryGeneratorConfig"
-                            }
-                        ],
-                        "discriminator": {
-                            "propertyName": "type",
-                            "mapping": {
-                                "default": "#/components/schemas/DefaultRAGQueryGeneratorConfig",
-                                "llm": "#/components/schemas/LLMRAGQueryGeneratorConfig"
-                            }
-                        },
-                        "description": "Configuration for the query generator."
-                    },
-                    "max_tokens_in_context": {
-                        "type": "integer",
-                        "default": 4096,
-                        "description": "Maximum number of tokens in the context."
-                    },
-                    "max_chunks": {
-                        "type": "integer",
-                        "default": 5,
-                        "description": "Maximum number of chunks to retrieve."
-                    },
-                    "chunk_template": {
-                        "type": "string",
-                        "default": "Result {index}\nContent: {chunk.content}\nMetadata: {metadata}\n",
-                        "description": "Template for formatting each retrieved chunk in the context. Available placeholders: {index} (1-based chunk ordinal), {chunk.content} (chunk content string), {metadata} (chunk metadata dict). Default: \"Result {index}\\nContent: {chunk.content}\\nMetadata: {metadata}\\n\""
-                    },
-                    "mode": {
-                        "$ref": "#/components/schemas/RAGSearchMode",
-                        "default": "vector",
-                        "description": "Search mode for retrieval—either \"vector\", \"keyword\", or \"hybrid\". Default \"vector\"."
-                    },
-                    "ranker": {
-                        "$ref": "#/components/schemas/Ranker",
-                        "description": "Configuration for the ranker to use in hybrid search. Defaults to RRF ranker."
-                    }
-                },
-                "additionalProperties": false,
-                "required": [
-                    "query_generator_config",
-                    "max_tokens_in_context",
-                    "max_chunks",
-                    "chunk_template"
-                ],
-                "title": "RAGQueryConfig",
-                "description": "Configuration for the RAG query generation."
-            },
-            "RAGSearchMode": {
-                "type": "string",
-                "enum": [
-                    "vector",
-                    "keyword",
-                    "hybrid"
-                ],
-                "title": "RAGSearchMode",
-                "description": "Search modes for RAG query retrieval: - VECTOR: Uses vector similarity search for semantic matching - KEYWORD: Uses keyword-based search for exact matching - HYBRID: Combines both vector and keyword search for better results"
-            },
-            "RRFRanker": {
-                "type": "object",
-                "properties": {
-                    "type": {
-                        "type": "string",
-                        "const": "rrf",
-                        "default": "rrf",
-                        "description": "The type of ranker, always \"rrf\""
-                    },
-                    "impact_factor": {
-                        "type": "number",
-                        "default": 60.0,
-                        "description": "The impact factor for RRF scoring. Higher values give more weight to higher-ranked results. Must be greater than 0"
-                    }
-                },
-                "additionalProperties": false,
-                "required": [
-                    "type",
-                    "impact_factor"
-                ],
-                "title": "RRFRanker",
-                "description": "Reciprocal Rank Fusion (RRF) ranker configuration."
-            },
-            "Ranker": {
-                "oneOf": [
-                    {
-                        "$ref": "#/components/schemas/RRFRanker"
-                    },
-                    {
-                        "$ref": "#/components/schemas/WeightedRanker"
-                    }
-                ],
-                "discriminator": {
-                    "propertyName": "type",
-                    "mapping": {
-                        "rrf": "#/components/schemas/RRFRanker",
-                        "weighted": "#/components/schemas/WeightedRanker"
-                    }
-                }
-            },
-            "WeightedRanker": {
-                "type": "object",
-                "properties": {
-                    "type": {
-                        "type": "string",
-                        "const": "weighted",
-                        "default": "weighted",
-                        "description": "The type of ranker, always \"weighted\""
-                    },
-                    "alpha": {
-                        "type": "number",
-                        "default": 0.5,
-                        "description": "Weight factor between 0 and 1. 0 means only use keyword scores, 1 means only use vector scores, values in between blend both scores."
-                    }
-                },
-                "additionalProperties": false,
-                "required": [
-                    "type",
-                    "alpha"
-                ],
-                "title": "WeightedRanker",
-                "description": "Weighted ranker configuration that combines vector and keyword scores."
-            },
-            "QueryRequest": {
-                "type": "object",
-                "properties": {
-                    "content": {
-                        "$ref": "#/components/schemas/InterleavedContent",
-                        "description": "The query content to search for in the indexed documents"
-                    },
-                    "vector_db_ids": {
-                        "type": "array",
-                        "items": {
-                            "type": "string"
-                        },
-                        "description": "List of vector database IDs to search within"
-                    },
-                    "query_config": {
-                        "$ref": "#/components/schemas/RAGQueryConfig",
-                        "description": "(Optional) Configuration parameters for the query operation"
-                    }
-                },
-                "additionalProperties": false,
-                "required": [
-                    "content",
-                    "vector_db_ids"
-                ],
-                "title": "QueryRequest"
-            },
-            "RAGQueryResult": {
-                "type": "object",
-                "properties": {
-                    "content": {
-                        "$ref": "#/components/schemas/InterleavedContent",
-                        "description": "(Optional) The retrieved content from the query"
-                    },
-                    "metadata": {
-                        "type": "object",
-                        "additionalProperties": {
-                            "oneOf": [
-                                {
-                                    "type": "null"
-                                },
-                                {
-                                    "type": "boolean"
-                                },
-                                {
-                                    "type": "number"
-                                },
-                                {
-                                    "type": "string"
-                                },
-                                {
-                                    "type": "array"
-                                },
-                                {
-                                    "type": "object"
-                                }
-                            ]
-                        },
-                        "description": "Additional metadata about the query result"
-                    }
-                },
-                "additionalProperties": false,
-                "required": [
-                    "metadata"
-                ],
-                "title": "RAGQueryResult",
-                "description": "Result of a RAG query containing retrieved content and metadata."
-            },
             "ToolGroup": {
                 "type": "object",
                 "properties": {
diff --git a/docs/static/stainless-llama-stack-spec.yaml b/docs/static/stainless-llama-stack-spec.yaml
index 93049a14a..98a309f12 100644
--- a/docs/static/stainless-llama-stack-spec.yaml
+++ b/docs/static/stainless-llama-stack-spec.yaml
@@ -2039,69 +2039,6 @@ paths:
           schema:
             $ref: '#/components/schemas/URL'
       deprecated: false
-  /v1/tool-runtime/rag-tool/insert:
-    post:
-      responses:
-        '200':
-          description: OK
-        '400':
-          $ref: '#/components/responses/BadRequest400'
-        '429':
-          $ref: >-
-            #/components/responses/TooManyRequests429
-        '500':
-          $ref: >-
-            #/components/responses/InternalServerError500
-        default:
-          $ref: '#/components/responses/DefaultError'
-      tags:
-        - ToolRuntime
-      summary: >-
-        Index documents so they can be used by the RAG system.
-      description: >-
-        Index documents so they can be used by the RAG system.
-      parameters: []
-      requestBody:
-        content:
-          application/json:
-            schema:
-              $ref: '#/components/schemas/InsertRequest'
-        required: true
-      deprecated: false
-  /v1/tool-runtime/rag-tool/query:
-    post:
-      responses:
-        '200':
-          description: >-
-            RAGQueryResult containing the retrieved content and metadata
-          content:
-            application/json:
-              schema:
-                $ref: '#/components/schemas/RAGQueryResult'
-        '400':
-          $ref: '#/components/responses/BadRequest400'
-        '429':
-          $ref: >-
-            #/components/responses/TooManyRequests429
-        '500':
-          $ref: >-
-            #/components/responses/InternalServerError500
-        default:
-          $ref: '#/components/responses/DefaultError'
-      tags:
-        - ToolRuntime
-      summary: >-
-        Query the RAG system for context; typically invoked by the agent.
-      description: >-
-        Query the RAG system for context; typically invoked by the agent.
-      parameters: []
-      requestBody:
-        content:
-          application/json:
-            schema:
-              $ref: '#/components/schemas/QueryRequest'
-        required: true
-      deprecated: false
   /v1/toolgroups:
     get:
       responses:
@@ -9921,274 +9858,6 @@ components:
       title: ListToolDefsResponse
       description: >-
         Response containing a list of tool definitions.
-    RAGDocument:
-      type: object
-      properties:
-        document_id:
-          type: string
-          description: The unique identifier for the document.
-        content:
-          oneOf:
-            - type: string
-            - $ref: '#/components/schemas/InterleavedContentItem'
-            - type: array
-              items:
-                $ref: '#/components/schemas/InterleavedContentItem'
-            - $ref: '#/components/schemas/URL'
-          description: The content of the document.
-        mime_type:
-          type: string
-          description: The MIME type of the document.
-        metadata:
-          type: object
-          additionalProperties:
-            oneOf:
-              - type: 'null'
-              - type: boolean
-              - type: number
-              - type: string
-              - type: array
-              - type: object
-          description: Additional metadata for the document.
-      additionalProperties: false
-      required:
-        - document_id
-        - content
-        - metadata
-      title: RAGDocument
-      description: >-
-        A document to be used for document ingestion in the RAG Tool.
-    InsertRequest:
-      type: object
-      properties:
-        documents:
-          type: array
-          items:
-            $ref: '#/components/schemas/RAGDocument'
-          description: >-
-            List of documents to index in the RAG system
-        vector_db_id:
-          type: string
-          description: >-
-            ID of the vector database to store the document embeddings
-        chunk_size_in_tokens:
-          type: integer
-          description: >-
-            (Optional) Size in tokens for document chunking during indexing
-      additionalProperties: false
-      required:
-        - documents
-        - vector_db_id
-        - chunk_size_in_tokens
-      title: InsertRequest
-    DefaultRAGQueryGeneratorConfig:
-      type: object
-      properties:
-        type:
-          type: string
-          const: default
-          default: default
-          description: >-
-            Type of query generator, always 'default'
-        separator:
-          type: string
-          default: ' '
-          description: >-
-            String separator used to join query terms
-      additionalProperties: false
-      required:
-        - type
-        - separator
-      title: DefaultRAGQueryGeneratorConfig
-      description: >-
-        Configuration for the default RAG query generator.
-    LLMRAGQueryGeneratorConfig:
-      type: object
-      properties:
-        type:
-          type: string
-          const: llm
-          default: llm
-          description: Type of query generator, always 'llm'
-        model:
-          type: string
-          description: >-
-            Name of the language model to use for query generation
-        template:
-          type: string
-          description: >-
-            Template string for formatting the query generation prompt
-      additionalProperties: false
-      required:
-        - type
-        - model
-        - template
-      title: LLMRAGQueryGeneratorConfig
-      description: >-
-        Configuration for the LLM-based RAG query generator.
-    RAGQueryConfig:
-      type: object
-      properties:
-        query_generator_config:
-          oneOf:
-            - $ref: '#/components/schemas/DefaultRAGQueryGeneratorConfig'
-            - $ref: '#/components/schemas/LLMRAGQueryGeneratorConfig'
-          discriminator:
-            propertyName: type
-            mapping:
-              default: '#/components/schemas/DefaultRAGQueryGeneratorConfig'
-              llm: '#/components/schemas/LLMRAGQueryGeneratorConfig'
-          description: Configuration for the query generator.
-        max_tokens_in_context:
-          type: integer
-          default: 4096
-          description: Maximum number of tokens in the context.
-        max_chunks:
-          type: integer
-          default: 5
-          description: Maximum number of chunks to retrieve.
-        chunk_template:
-          type: string
-          default: >
-            Result {index}
-
-            Content: {chunk.content}
-
-            Metadata: {metadata}
-          description: >-
-            Template for formatting each retrieved chunk in the context. Available
-            placeholders: {index} (1-based chunk ordinal), {chunk.content} (chunk
-            content string), {metadata} (chunk metadata dict). Default: "Result {index}\nContent:
-            {chunk.content}\nMetadata: {metadata}\n"
-        mode:
-          $ref: '#/components/schemas/RAGSearchMode'
-          default: vector
-          description: >-
-            Search mode for retrieval—either "vector", "keyword", or "hybrid". Default
-            "vector".
-        ranker:
-          $ref: '#/components/schemas/Ranker'
-          description: >-
-            Configuration for the ranker to use in hybrid search. Defaults to RRF
-            ranker.
-      additionalProperties: false
-      required:
-        - query_generator_config
-        - max_tokens_in_context
-        - max_chunks
-        - chunk_template
-      title: RAGQueryConfig
-      description: >-
-        Configuration for the RAG query generation.
-    RAGSearchMode:
-      type: string
-      enum:
-        - vector
-        - keyword
-        - hybrid
-      title: RAGSearchMode
-      description: >-
-        Search modes for RAG query retrieval: - VECTOR: Uses vector similarity search
-        for semantic matching - KEYWORD: Uses keyword-based search for exact matching
-        - HYBRID: Combines both vector and keyword search for better results
-    RRFRanker:
-      type: object
-      properties:
-        type:
-          type: string
-          const: rrf
-          default: rrf
-          description: The type of ranker, always "rrf"
-        impact_factor:
-          type: number
-          default: 60.0
-          description: >-
-            The impact factor for RRF scoring. Higher values give more weight to higher-ranked
-            results. Must be greater than 0
-      additionalProperties: false
-      required:
-        - type
-        - impact_factor
-      title: RRFRanker
-      description: >-
-        Reciprocal Rank Fusion (RRF) ranker configuration.
-    Ranker:
-      oneOf:
-        - $ref: '#/components/schemas/RRFRanker'
-        - $ref: '#/components/schemas/WeightedRanker'
-      discriminator:
-        propertyName: type
-        mapping:
-          rrf: '#/components/schemas/RRFRanker'
-          weighted: '#/components/schemas/WeightedRanker'
-    WeightedRanker:
-      type: object
-      properties:
-        type:
-          type: string
-          const: weighted
-          default: weighted
-          description: The type of ranker, always "weighted"
-        alpha:
-          type: number
-          default: 0.5
-          description: >-
-            Weight factor between 0 and 1. 0 means only use keyword scores, 1 means
-            only use vector scores, values in between blend both scores.
-      additionalProperties: false
-      required:
-        - type
-        - alpha
-      title: WeightedRanker
-      description: >-
-        Weighted ranker configuration that combines vector and keyword scores.
-    QueryRequest:
-      type: object
-      properties:
-        content:
-          $ref: '#/components/schemas/InterleavedContent'
-          description: >-
-            The query content to search for in the indexed documents
-        vector_db_ids:
-          type: array
-          items:
-            type: string
-          description: >-
-            List of vector database IDs to search within
-        query_config:
-          $ref: '#/components/schemas/RAGQueryConfig'
-          description: >-
-            (Optional) Configuration parameters for the query operation
-      additionalProperties: false
-      required:
-        - content
-        - vector_db_ids
-      title: QueryRequest
-    RAGQueryResult:
-      type: object
-      properties:
-        content:
-          $ref: '#/components/schemas/InterleavedContent'
-          description: >-
-            (Optional) The retrieved content from the query
-        metadata:
-          type: object
-          additionalProperties:
-            oneOf:
-              - type: 'null'
-              - type: boolean
-              - type: number
-              - type: string
-              - type: array
-              - type: object
-          description: >-
-            Additional metadata about the query result
-      additionalProperties: false
-      required:
-        - metadata
-      title: RAGQueryResult
-      description: >-
-        Result of a RAG query containing retrieved content and metadata.
     ToolGroup:
       type: object
       properties:
diff --git a/llama_stack/apis/tools/__init__.py b/llama_stack/apis/tools/__init__.py
index b25310ecf..2908d1c62 100644
--- a/llama_stack/apis/tools/__init__.py
+++ b/llama_stack/apis/tools/__init__.py
@@ -4,5 +4,4 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
-from .rag_tool import *
 from .tools import *
diff --git a/llama_stack/apis/tools/rag_tool.py b/llama_stack/apis/tools/rag_tool.py
deleted file mode 100644
index ed7847e23..000000000
--- a/llama_stack/apis/tools/rag_tool.py
+++ /dev/null
@@ -1,218 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from enum import Enum, StrEnum
-from typing import Annotated, Any, Literal, Protocol
-
-from pydantic import BaseModel, Field, field_validator
-from typing_extensions import runtime_checkable
-
-from llama_stack.apis.common.content_types import URL, InterleavedContent
-from llama_stack.apis.version import LLAMA_STACK_API_V1
-from llama_stack.providers.utils.telemetry.trace_protocol import trace_protocol
-from llama_stack.schema_utils import json_schema_type, register_schema, webmethod
-
-
-@json_schema_type
-class RRFRanker(BaseModel):
-    """
-    Reciprocal Rank Fusion (RRF) ranker configuration.
-
-    :param type: The type of ranker, always "rrf"
-    :param impact_factor: The impact factor for RRF scoring. Higher values give more weight to higher-ranked results.
-                         Must be greater than 0
-    """
-
-    type: Literal["rrf"] = "rrf"
-    impact_factor: float = Field(default=60.0, gt=0.0)  # default of 60 for optimal performance
-
-
-@json_schema_type
-class WeightedRanker(BaseModel):
-    """
-    Weighted ranker configuration that combines vector and keyword scores.
-
-    :param type: The type of ranker, always "weighted"
-    :param alpha: Weight factor between 0 and 1.
-                 0 means only use keyword scores,
-                 1 means only use vector scores,
-                 values in between blend both scores.
-    """
-
-    type: Literal["weighted"] = "weighted"
-    alpha: float = Field(
-        default=0.5,
-        ge=0.0,
-        le=1.0,
-        description="Weight factor between 0 and 1. 0 means only keyword scores, 1 means only vector scores.",
-    )
-
-
-Ranker = Annotated[
-    RRFRanker | WeightedRanker,
-    Field(discriminator="type"),
-]
-register_schema(Ranker, name="Ranker")
-
-
-@json_schema_type
-class RAGDocument(BaseModel):
-    """
-    A document to be used for document ingestion in the RAG Tool.
-
-    :param document_id: The unique identifier for the document.
-    :param content: The content of the document.
-    :param mime_type: The MIME type of the document.
-    :param metadata: Additional metadata for the document.
-    """
-
-    document_id: str
-    content: InterleavedContent | URL
-    mime_type: str | None = None
-    metadata: dict[str, Any] = Field(default_factory=dict)
-
-
-@json_schema_type
-class RAGQueryResult(BaseModel):
-    """Result of a RAG query containing retrieved content and metadata.
-
-    :param content: (Optional) The retrieved content from the query
-    :param metadata: Additional metadata about the query result
-    """
-
-    content: InterleavedContent | None = None
-    metadata: dict[str, Any] = Field(default_factory=dict)
-
-
-@json_schema_type
-class RAGQueryGenerator(Enum):
-    """Types of query generators for RAG systems.
-
-    :cvar default: Default query generator using simple text processing
-    :cvar llm: LLM-based query generator for enhanced query understanding
-    :cvar custom: Custom query generator implementation
-    """
-
-    default = "default"
-    llm = "llm"
-    custom = "custom"
-
-
-@json_schema_type
-class RAGSearchMode(StrEnum):
-    """
-    Search modes for RAG query retrieval:
-    - VECTOR: Uses vector similarity search for semantic matching
-    - KEYWORD: Uses keyword-based search for exact matching
-    - HYBRID: Combines both vector and keyword search for better results
-    """
-
-    VECTOR = "vector"
-    KEYWORD = "keyword"
-    HYBRID = "hybrid"
-
-
-@json_schema_type
-class DefaultRAGQueryGeneratorConfig(BaseModel):
-    """Configuration for the default RAG query generator.
-
-    :param type: Type of query generator, always 'default'
-    :param separator: String separator used to join query terms
-    """
-
-    type: Literal["default"] = "default"
-    separator: str = " "
-
-
-@json_schema_type
-class LLMRAGQueryGeneratorConfig(BaseModel):
-    """Configuration for the LLM-based RAG query generator.
-
-    :param type: Type of query generator, always 'llm'
-    :param model: Name of the language model to use for query generation
-    :param template: Template string for formatting the query generation prompt
-    """
-
-    type: Literal["llm"] = "llm"
-    model: str
-    template: str
-
-
-RAGQueryGeneratorConfig = Annotated[
-    DefaultRAGQueryGeneratorConfig | LLMRAGQueryGeneratorConfig,
-    Field(discriminator="type"),
-]
-register_schema(RAGQueryGeneratorConfig, name="RAGQueryGeneratorConfig")
-
-
-@json_schema_type
-class RAGQueryConfig(BaseModel):
-    """
-    Configuration for the RAG query generation.
-
-    :param query_generator_config: Configuration for the query generator.
-    :param max_tokens_in_context: Maximum number of tokens in the context.
-    :param max_chunks: Maximum number of chunks to retrieve.
-    :param chunk_template: Template for formatting each retrieved chunk in the context.
-        Available placeholders: {index} (1-based chunk ordinal), {chunk.content} (chunk content string), {metadata} (chunk metadata dict).
-        Default: "Result {index}\\nContent: {chunk.content}\\nMetadata: {metadata}\\n"
-    :param mode: Search mode for retrieval—either "vector", "keyword", or "hybrid". Default "vector".
-    :param ranker: Configuration for the ranker to use in hybrid search. Defaults to RRF ranker.
-    """
-
-    # This config defines how a query is generated using the messages
-    # for memory bank retrieval.
-    query_generator_config: RAGQueryGeneratorConfig = Field(default=DefaultRAGQueryGeneratorConfig())
-    max_tokens_in_context: int = 4096
-    max_chunks: int = 5
-    chunk_template: str = "Result {index}\nContent: {chunk.content}\nMetadata: {metadata}\n"
-    mode: RAGSearchMode | None = RAGSearchMode.VECTOR
-    ranker: Ranker | None = Field(default=None)  # Only used for hybrid mode
-
-    @field_validator("chunk_template")
-    def validate_chunk_template(cls, v: str) -> str:
-        if "{chunk.content}" not in v:
-            raise ValueError("chunk_template must contain {chunk.content}")
-        if "{index}" not in v:
-            raise ValueError("chunk_template must contain {index}")
-        if len(v) == 0:
-            raise ValueError("chunk_template must not be empty")
-        return v
-
-
-@runtime_checkable
-@trace_protocol
-class RAGToolRuntime(Protocol):
-    @webmethod(route="/tool-runtime/rag-tool/insert", method="POST", level=LLAMA_STACK_API_V1)
-    async def insert(
-        self,
-        documents: list[RAGDocument],
-        vector_db_id: str,
-        chunk_size_in_tokens: int = 512,
-    ) -> None:
-        """Index documents so they can be used by the RAG system.
-
-        :param documents: List of documents to index in the RAG system
-        :param vector_db_id: ID of the vector database to store the document embeddings
-        :param chunk_size_in_tokens: (Optional) Size in tokens for document chunking during indexing
-        """
-        ...
-
-    @webmethod(route="/tool-runtime/rag-tool/query", method="POST", level=LLAMA_STACK_API_V1)
-    async def query(
-        self,
-        content: InterleavedContent,
-        vector_db_ids: list[str],
-        query_config: RAGQueryConfig | None = None,
-    ) -> RAGQueryResult:
-        """Query the RAG system for context; typically invoked by the agent.
-
-        :param content: The query content to search for in the indexed documents
-        :param vector_db_ids: List of vector database IDs to search within
-        :param query_config: (Optional) Configuration parameters for the query operation
-        :returns: RAGQueryResult containing the retrieved content and metadata
-        """
-        ...
diff --git a/llama_stack/apis/tools/tools.py b/llama_stack/apis/tools/tools.py
index b6a1a2543..feac0d33e 100644
--- a/llama_stack/apis/tools/tools.py
+++ b/llama_stack/apis/tools/tools.py
@@ -4,7 +4,6 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
-from enum import Enum
 from typing import Any, Literal, Protocol
 
 from pydantic import BaseModel
@@ -16,8 +15,6 @@ from llama_stack.apis.version import LLAMA_STACK_API_V1
 from llama_stack.providers.utils.telemetry.trace_protocol import trace_protocol
 from llama_stack.schema_utils import json_schema_type, webmethod
 
-from .rag_tool import RAGToolRuntime
-
 
 @json_schema_type
 class ToolDef(BaseModel):
@@ -181,22 +178,11 @@ class ToolGroups(Protocol):
         ...
 
 
-class SpecialToolGroup(Enum):
-    """Special tool groups with predefined functionality.
-
-    :cvar rag_tool: Retrieval-Augmented Generation tool group for document search and retrieval
-    """
-
-    rag_tool = "rag_tool"
-
-
 @runtime_checkable
 @trace_protocol
 class ToolRuntime(Protocol):
     tool_store: ToolStore | None = None
 
-    rag_tool: RAGToolRuntime | None = None
-
     # TODO: This needs to be renamed once OPEN API generator name conflict issue is fixed.
     @webmethod(route="/tool-runtime/list-tools", method="GET", level=LLAMA_STACK_API_V1)
     async def list_runtime_tools(
diff --git a/llama_stack/core/routers/tool_runtime.py b/llama_stack/core/routers/tool_runtime.py
index be4c13905..7c5bb25c6 100644
--- a/llama_stack/core/routers/tool_runtime.py
+++ b/llama_stack/core/routers/tool_runtime.py
@@ -8,16 +8,8 @@ from typing import Any
 
 from llama_stack.apis.common.content_types import (
     URL,
-    InterleavedContent,
-)
-from llama_stack.apis.tools import (
-    ListToolDefsResponse,
-    RAGDocument,
-    RAGQueryConfig,
-    RAGQueryResult,
-    RAGToolRuntime,
-    ToolRuntime,
 )
+from llama_stack.apis.tools import ListToolDefsResponse, ToolRuntime
 from llama_stack.log import get_logger
 
 from ..routing_tables.toolgroups import ToolGroupsRoutingTable
@@ -26,36 +18,6 @@ logger = get_logger(name=__name__, category="core::routers")
 
 
 class ToolRuntimeRouter(ToolRuntime):
-    class RagToolImpl(RAGToolRuntime):
-        def __init__(
-            self,
-            routing_table: ToolGroupsRoutingTable,
-        ) -> None:
-            logger.debug("Initializing ToolRuntimeRouter.RagToolImpl")
-            self.routing_table = routing_table
-
-        async def query(
-            self,
-            content: InterleavedContent,
-            vector_store_ids: list[str],
-            query_config: RAGQueryConfig | None = None,
-        ) -> RAGQueryResult:
-            logger.debug(f"ToolRuntimeRouter.RagToolImpl.query: {vector_store_ids}")
-            provider = await self.routing_table.get_provider_impl("knowledge_search")
-            return await provider.query(content, vector_store_ids, query_config)
-
-        async def insert(
-            self,
-            documents: list[RAGDocument],
-            vector_store_id: str,
-            chunk_size_in_tokens: int = 512,
-        ) -> None:
-            logger.debug(
-                f"ToolRuntimeRouter.RagToolImpl.insert: {vector_store_id}, {len(documents)} documents, chunk_size={chunk_size_in_tokens}"
-            )
-            provider = await self.routing_table.get_provider_impl("insert_into_memory")
-            return await provider.insert(documents, vector_store_id, chunk_size_in_tokens)
-
     def __init__(
         self,
         routing_table: ToolGroupsRoutingTable,
@@ -63,11 +25,6 @@ class ToolRuntimeRouter(ToolRuntime):
         logger.debug("Initializing ToolRuntimeRouter")
         self.routing_table = routing_table
 
-        # HACK ALERT this should be in sync with "get_all_api_endpoints()"
-        self.rag_tool = self.RagToolImpl(routing_table)
-        for method in ("query", "insert"):
-            setattr(self, f"rag_tool.{method}", getattr(self.rag_tool, method))
-
     async def initialize(self) -> None:
         logger.debug("ToolRuntimeRouter.initialize")
         pass
diff --git a/llama_stack/core/server/routes.py b/llama_stack/core/server/routes.py
index 4970d0bf8..ed76ea86f 100644
--- a/llama_stack/core/server/routes.py
+++ b/llama_stack/core/server/routes.py
@@ -13,7 +13,6 @@ from aiohttp import hdrs
 from starlette.routing import Route
 
 from llama_stack.apis.datatypes import Api, ExternalApiSpec
-from llama_stack.apis.tools import RAGToolRuntime, SpecialToolGroup
 from llama_stack.core.resolver import api_protocol_map
 from llama_stack.schema_utils import WebMethod
 
@@ -25,33 +24,16 @@ RouteImpls = dict[str, PathImpl]
 RouteMatch = tuple[EndpointFunc, PathParams, str, WebMethod]
 
 
-def toolgroup_protocol_map():
-    return {
-        SpecialToolGroup.rag_tool: RAGToolRuntime,
-    }
-
-
 def get_all_api_routes(
     external_apis: dict[Api, ExternalApiSpec] | None = None,
 ) -> dict[Api, list[tuple[Route, WebMethod]]]:
     apis = {}
 
     protocols = api_protocol_map(external_apis)
-    toolgroup_protocols = toolgroup_protocol_map()
     for api, protocol in protocols.items():
         routes = []
         protocol_methods = inspect.getmembers(protocol, predicate=inspect.isfunction)
 
-        # HACK ALERT
-        if api == Api.tool_runtime:
-            for tool_group in SpecialToolGroup:
-                sub_protocol = toolgroup_protocols[tool_group]
-                sub_protocol_methods = inspect.getmembers(sub_protocol, predicate=inspect.isfunction)
-                for name, method in sub_protocol_methods:
-                    if not hasattr(method, "__webmethod__"):
-                        continue
-                    protocol_methods.append((f"{tool_group.value}.{name}", method))
-
         for name, method in protocol_methods:
             # Get all webmethods for this method (supports multiple decorators)
             webmethods = getattr(method, "__webmethods__", [])
diff --git a/llama_stack/core/stack.py b/llama_stack/core/stack.py
index 4cf1d072d..49100b4bc 100644
--- a/llama_stack/core/stack.py
+++ b/llama_stack/core/stack.py
@@ -32,7 +32,7 @@ from llama_stack.apis.scoring_functions import ScoringFunctions
 from llama_stack.apis.shields import Shields
 from llama_stack.apis.synthetic_data_generation import SyntheticDataGeneration
 from llama_stack.apis.telemetry import Telemetry
-from llama_stack.apis.tools import RAGToolRuntime, ToolGroups, ToolRuntime
+from llama_stack.apis.tools import ToolGroups, ToolRuntime
 from llama_stack.apis.vector_io import VectorIO
 from llama_stack.core.conversations.conversations import ConversationServiceConfig, ConversationServiceImpl
 from llama_stack.core.datatypes import Provider, StackRunConfig, VectorStoresConfig
@@ -80,7 +80,6 @@ class LlamaStack(
     Inspect,
     ToolGroups,
     ToolRuntime,
-    RAGToolRuntime,
     Files,
     Prompts,
     Conversations,
diff --git a/llama_stack/distributions/ci-tests/build.yaml b/llama_stack/distributions/ci-tests/build.yaml
index c01e415a9..3cf43de15 100644
--- a/llama_stack/distributions/ci-tests/build.yaml
+++ b/llama_stack/distributions/ci-tests/build.yaml
@@ -48,7 +48,6 @@ distribution_spec:
     tool_runtime:
     - provider_type: remote::brave-search
     - provider_type: remote::tavily-search
-    - provider_type: inline::rag-runtime
     - provider_type: remote::model-context-protocol
     batches:
     - provider_type: inline::reference
diff --git a/llama_stack/distributions/ci-tests/run.yaml b/llama_stack/distributions/ci-tests/run.yaml
index ecf9eed3b..f403527fc 100644
--- a/llama_stack/distributions/ci-tests/run.yaml
+++ b/llama_stack/distributions/ci-tests/run.yaml
@@ -216,8 +216,6 @@ providers:
     config:
       api_key: ${env.TAVILY_SEARCH_API_KEY:=}
       max_results: 3
-  - provider_id: rag-runtime
-    provider_type: inline::rag-runtime
   - provider_id: model-context-protocol
     provider_type: remote::model-context-protocol
   batches:
@@ -263,8 +261,6 @@ registered_resources:
   tool_groups:
   - toolgroup_id: builtin::websearch
     provider_id: tavily-search
-  - toolgroup_id: builtin::rag
-    provider_id: rag-runtime
 server:
   port: 8321
 telemetry:
diff --git a/llama_stack/distributions/dell/build.yaml b/llama_stack/distributions/dell/build.yaml
index 7bc26ca9e..0275a47a1 100644
--- a/llama_stack/distributions/dell/build.yaml
+++ b/llama_stack/distributions/dell/build.yaml
@@ -26,7 +26,6 @@ distribution_spec:
     tool_runtime:
     - provider_type: remote::brave-search
     - provider_type: remote::tavily-search
-    - provider_type: inline::rag-runtime
 image_type: venv
 additional_pip_packages:
 - aiosqlite
diff --git a/llama_stack/distributions/dell/dell.py b/llama_stack/distributions/dell/dell.py
index 88e72688f..708ba0b10 100644
--- a/llama_stack/distributions/dell/dell.py
+++ b/llama_stack/distributions/dell/dell.py
@@ -45,7 +45,6 @@ def get_distribution_template() -> DistributionTemplate:
         "tool_runtime": [
             BuildProvider(provider_type="remote::brave-search"),
             BuildProvider(provider_type="remote::tavily-search"),
-            BuildProvider(provider_type="inline::rag-runtime"),
         ],
     }
     name = "dell"
@@ -98,10 +97,6 @@ def get_distribution_template() -> DistributionTemplate:
             toolgroup_id="builtin::websearch",
             provider_id="brave-search",
         ),
-        ToolGroupInput(
-            toolgroup_id="builtin::rag",
-            provider_id="rag-runtime",
-        ),
     ]
 
     return DistributionTemplate(
diff --git a/llama_stack/distributions/dell/run-with-safety.yaml b/llama_stack/distributions/dell/run-with-safety.yaml
index 2563f2f4b..062c50e2b 100644
--- a/llama_stack/distributions/dell/run-with-safety.yaml
+++ b/llama_stack/distributions/dell/run-with-safety.yaml
@@ -87,8 +87,6 @@ providers:
     config:
       api_key: ${env.TAVILY_SEARCH_API_KEY:=}
       max_results: 3
-  - provider_id: rag-runtime
-    provider_type: inline::rag-runtime
 storage:
   backends:
     kv_default:
@@ -133,8 +131,6 @@ registered_resources:
   tool_groups:
   - toolgroup_id: builtin::websearch
     provider_id: brave-search
-  - toolgroup_id: builtin::rag
-    provider_id: rag-runtime
 server:
   port: 8321
 telemetry:
diff --git a/llama_stack/distributions/dell/run.yaml b/llama_stack/distributions/dell/run.yaml
index 7bada394f..42e0658bd 100644
--- a/llama_stack/distributions/dell/run.yaml
+++ b/llama_stack/distributions/dell/run.yaml
@@ -83,8 +83,6 @@ providers:
     config:
       api_key: ${env.TAVILY_SEARCH_API_KEY:=}
       max_results: 3
-  - provider_id: rag-runtime
-    provider_type: inline::rag-runtime
 storage:
   backends:
     kv_default:
@@ -124,8 +122,6 @@ registered_resources:
   tool_groups:
   - toolgroup_id: builtin::websearch
     provider_id: brave-search
-  - toolgroup_id: builtin::rag
-    provider_id: rag-runtime
 server:
   port: 8321
 telemetry:
diff --git a/llama_stack/distributions/meta-reference-gpu/build.yaml b/llama_stack/distributions/meta-reference-gpu/build.yaml
index 1513742a7..74da29bb8 100644
--- a/llama_stack/distributions/meta-reference-gpu/build.yaml
+++ b/llama_stack/distributions/meta-reference-gpu/build.yaml
@@ -24,7 +24,6 @@ distribution_spec:
     tool_runtime:
     - provider_type: remote::brave-search
     - provider_type: remote::tavily-search
-    - provider_type: inline::rag-runtime
     - provider_type: remote::model-context-protocol
 image_type: venv
 additional_pip_packages:
diff --git a/llama_stack/distributions/meta-reference-gpu/meta_reference.py b/llama_stack/distributions/meta-reference-gpu/meta_reference.py
index 4e4ddef33..aa66d43a0 100644
--- a/llama_stack/distributions/meta-reference-gpu/meta_reference.py
+++ b/llama_stack/distributions/meta-reference-gpu/meta_reference.py
@@ -47,7 +47,6 @@ def get_distribution_template() -> DistributionTemplate:
         "tool_runtime": [
             BuildProvider(provider_type="remote::brave-search"),
             BuildProvider(provider_type="remote::tavily-search"),
-            BuildProvider(provider_type="inline::rag-runtime"),
             BuildProvider(provider_type="remote::model-context-protocol"),
         ],
     }
@@ -92,10 +91,6 @@ def get_distribution_template() -> DistributionTemplate:
             toolgroup_id="builtin::websearch",
             provider_id="tavily-search",
         ),
-        ToolGroupInput(
-            toolgroup_id="builtin::rag",
-            provider_id="rag-runtime",
-        ),
     ]
 
     return DistributionTemplate(
diff --git a/llama_stack/distributions/meta-reference-gpu/run-with-safety.yaml b/llama_stack/distributions/meta-reference-gpu/run-with-safety.yaml
index 01b5db4f9..6e74201db 100644
--- a/llama_stack/distributions/meta-reference-gpu/run-with-safety.yaml
+++ b/llama_stack/distributions/meta-reference-gpu/run-with-safety.yaml
@@ -98,8 +98,6 @@ providers:
     config:
       api_key: ${env.TAVILY_SEARCH_API_KEY:=}
       max_results: 3
-  - provider_id: rag-runtime
-    provider_type: inline::rag-runtime
   - provider_id: model-context-protocol
     provider_type: remote::model-context-protocol
 storage:
@@ -146,8 +144,6 @@ registered_resources:
   tool_groups:
   - toolgroup_id: builtin::websearch
     provider_id: tavily-search
-  - toolgroup_id: builtin::rag
-    provider_id: rag-runtime
 server:
   port: 8321
 telemetry:
diff --git a/llama_stack/distributions/meta-reference-gpu/run.yaml b/llama_stack/distributions/meta-reference-gpu/run.yaml
index 87c33dde0..92934ca74 100644
--- a/llama_stack/distributions/meta-reference-gpu/run.yaml
+++ b/llama_stack/distributions/meta-reference-gpu/run.yaml
@@ -88,8 +88,6 @@ providers:
     config:
       api_key: ${env.TAVILY_SEARCH_API_KEY:=}
       max_results: 3
-  - provider_id: rag-runtime
-    provider_type: inline::rag-runtime
   - provider_id: model-context-protocol
     provider_type: remote::model-context-protocol
 storage:
@@ -131,8 +129,6 @@ registered_resources:
   tool_groups:
   - toolgroup_id: builtin::websearch
     provider_id: tavily-search
-  - toolgroup_id: builtin::rag
-    provider_id: rag-runtime
 server:
   port: 8321
 telemetry:
diff --git a/llama_stack/distributions/nvidia/build.yaml b/llama_stack/distributions/nvidia/build.yaml
index 8ddd12439..3412ea15b 100644
--- a/llama_stack/distributions/nvidia/build.yaml
+++ b/llama_stack/distributions/nvidia/build.yaml
@@ -19,8 +19,7 @@ distribution_spec:
     - provider_type: remote::nvidia
     scoring:
     - provider_type: inline::basic
-    tool_runtime:
-    - provider_type: inline::rag-runtime
+    tool_runtime: []
     files:
     - provider_type: inline::localfs
 image_type: venv
diff --git a/llama_stack/distributions/nvidia/nvidia.py b/llama_stack/distributions/nvidia/nvidia.py
index a92a2e6f8..889f83aa5 100644
--- a/llama_stack/distributions/nvidia/nvidia.py
+++ b/llama_stack/distributions/nvidia/nvidia.py
@@ -28,7 +28,7 @@ def get_distribution_template(name: str = "nvidia") -> DistributionTemplate:
             BuildProvider(provider_type="remote::nvidia"),
         ],
         "scoring": [BuildProvider(provider_type="inline::basic")],
-        "tool_runtime": [BuildProvider(provider_type="inline::rag-runtime")],
+        "tool_runtime": [],
         "files": [BuildProvider(provider_type="inline::localfs")],
     }
 
@@ -66,12 +66,7 @@ def get_distribution_template(name: str = "nvidia") -> DistributionTemplate:
         provider_id="nvidia",
     )
 
-    default_tool_groups = [
-        ToolGroupInput(
-            toolgroup_id="builtin::rag",
-            provider_id="rag-runtime",
-        ),
-    ]
+    default_tool_groups: list[ToolGroupInput] = []
 
     return DistributionTemplate(
         name=name,
diff --git a/llama_stack/distributions/nvidia/run-with-safety.yaml b/llama_stack/distributions/nvidia/run-with-safety.yaml
index c23d0f9cb..dca29ed2a 100644
--- a/llama_stack/distributions/nvidia/run-with-safety.yaml
+++ b/llama_stack/distributions/nvidia/run-with-safety.yaml
@@ -80,9 +80,7 @@ providers:
   scoring:
   - provider_id: basic
     provider_type: inline::basic
-  tool_runtime:
-  - provider_id: rag-runtime
-    provider_type: inline::rag-runtime
+  tool_runtime: []
   files:
   - provider_id: meta-reference-files
     provider_type: inline::localfs
@@ -128,9 +126,7 @@ registered_resources:
   datasets: []
   scoring_fns: []
   benchmarks: []
-  tool_groups:
-  - toolgroup_id: builtin::rag
-    provider_id: rag-runtime
+  tool_groups: []
 server:
   port: 8321
 telemetry:
diff --git a/llama_stack/distributions/nvidia/run.yaml b/llama_stack/distributions/nvidia/run.yaml
index 81e744d53..e35d9c44c 100644
--- a/llama_stack/distributions/nvidia/run.yaml
+++ b/llama_stack/distributions/nvidia/run.yaml
@@ -69,9 +69,7 @@ providers:
   scoring:
   - provider_id: basic
     provider_type: inline::basic
-  tool_runtime:
-  - provider_id: rag-runtime
-    provider_type: inline::rag-runtime
+  tool_runtime: []
   files:
   - provider_id: meta-reference-files
     provider_type: inline::localfs
@@ -107,9 +105,7 @@ registered_resources:
   datasets: []
   scoring_fns: []
   benchmarks: []
-  tool_groups:
-  - toolgroup_id: builtin::rag
-    provider_id: rag-runtime
+  tool_groups: []
 server:
   port: 8321
 telemetry:
diff --git a/llama_stack/distributions/open-benchmark/build.yaml b/llama_stack/distributions/open-benchmark/build.yaml
index 05acd98e3..9fc0e9eb0 100644
--- a/llama_stack/distributions/open-benchmark/build.yaml
+++ b/llama_stack/distributions/open-benchmark/build.yaml
@@ -28,7 +28,6 @@ distribution_spec:
     tool_runtime:
     - provider_type: remote::brave-search
     - provider_type: remote::tavily-search
-    - provider_type: inline::rag-runtime
     - provider_type: remote::model-context-protocol
 image_type: venv
 additional_pip_packages:
diff --git a/llama_stack/distributions/open-benchmark/open_benchmark.py b/llama_stack/distributions/open-benchmark/open_benchmark.py
index 2b7760894..cceec74fd 100644
--- a/llama_stack/distributions/open-benchmark/open_benchmark.py
+++ b/llama_stack/distributions/open-benchmark/open_benchmark.py
@@ -118,7 +118,6 @@ def get_distribution_template() -> DistributionTemplate:
         "tool_runtime": [
             BuildProvider(provider_type="remote::brave-search"),
             BuildProvider(provider_type="remote::tavily-search"),
-            BuildProvider(provider_type="inline::rag-runtime"),
             BuildProvider(provider_type="remote::model-context-protocol"),
         ],
     }
@@ -154,10 +153,6 @@ def get_distribution_template() -> DistributionTemplate:
             toolgroup_id="builtin::websearch",
             provider_id="tavily-search",
         ),
-        ToolGroupInput(
-            toolgroup_id="builtin::rag",
-            provider_id="rag-runtime",
-        ),
     ]
 
     models, _ = get_model_registry(available_models)
diff --git a/llama_stack/distributions/open-benchmark/run.yaml b/llama_stack/distributions/open-benchmark/run.yaml
index 4fd0e199b..8f63e4417 100644
--- a/llama_stack/distributions/open-benchmark/run.yaml
+++ b/llama_stack/distributions/open-benchmark/run.yaml
@@ -118,8 +118,6 @@ providers:
     config:
       api_key: ${env.TAVILY_SEARCH_API_KEY:=}
       max_results: 3
-  - provider_id: rag-runtime
-    provider_type: inline::rag-runtime
   - provider_id: model-context-protocol
     provider_type: remote::model-context-protocol
 storage:
@@ -244,8 +242,6 @@ registered_resources:
   tool_groups:
   - toolgroup_id: builtin::websearch
     provider_id: tavily-search
-  - toolgroup_id: builtin::rag
-    provider_id: rag-runtime
 server:
   port: 8321
 telemetry:
diff --git a/llama_stack/distributions/postgres-demo/build.yaml b/llama_stack/distributions/postgres-demo/build.yaml
index 063dc3999..99b4edeb3 100644
--- a/llama_stack/distributions/postgres-demo/build.yaml
+++ b/llama_stack/distributions/postgres-demo/build.yaml
@@ -14,7 +14,6 @@ distribution_spec:
     tool_runtime:
     - provider_type: remote::brave-search
     - provider_type: remote::tavily-search
-    - provider_type: inline::rag-runtime
     - provider_type: remote::model-context-protocol
 image_type: venv
 additional_pip_packages:
diff --git a/llama_stack/distributions/postgres-demo/postgres_demo.py b/llama_stack/distributions/postgres-demo/postgres_demo.py
index 876370ef3..9f8d35cb1 100644
--- a/llama_stack/distributions/postgres-demo/postgres_demo.py
+++ b/llama_stack/distributions/postgres-demo/postgres_demo.py
@@ -45,7 +45,6 @@ def get_distribution_template() -> DistributionTemplate:
         "tool_runtime": [
             BuildProvider(provider_type="remote::brave-search"),
             BuildProvider(provider_type="remote::tavily-search"),
-            BuildProvider(provider_type="inline::rag-runtime"),
             BuildProvider(provider_type="remote::model-context-protocol"),
         ],
     }
@@ -66,10 +65,6 @@ def get_distribution_template() -> DistributionTemplate:
             toolgroup_id="builtin::websearch",
             provider_id="tavily-search",
         ),
-        ToolGroupInput(
-            toolgroup_id="builtin::rag",
-            provider_id="rag-runtime",
-        ),
     ]
 
     default_models = [
diff --git a/llama_stack/distributions/postgres-demo/run.yaml b/llama_stack/distributions/postgres-demo/run.yaml
index 0d7ecff48..67222969c 100644
--- a/llama_stack/distributions/postgres-demo/run.yaml
+++ b/llama_stack/distributions/postgres-demo/run.yaml
@@ -54,8 +54,6 @@ providers:
     config:
       api_key: ${env.TAVILY_SEARCH_API_KEY:=}
       max_results: 3
-  - provider_id: rag-runtime
-    provider_type: inline::rag-runtime
   - provider_id: model-context-protocol
     provider_type: remote::model-context-protocol
 storage:
@@ -107,8 +105,6 @@ registered_resources:
   tool_groups:
   - toolgroup_id: builtin::websearch
     provider_id: tavily-search
-  - toolgroup_id: builtin::rag
-    provider_id: rag-runtime
 server:
   port: 8321
 telemetry:
diff --git a/llama_stack/distributions/starter-gpu/build.yaml b/llama_stack/distributions/starter-gpu/build.yaml
index b2e2a0c85..678d7995d 100644
--- a/llama_stack/distributions/starter-gpu/build.yaml
+++ b/llama_stack/distributions/starter-gpu/build.yaml
@@ -49,7 +49,6 @@ distribution_spec:
     tool_runtime:
     - provider_type: remote::brave-search
     - provider_type: remote::tavily-search
-    - provider_type: inline::rag-runtime
     - provider_type: remote::model-context-protocol
     batches:
     - provider_type: inline::reference
diff --git a/llama_stack/distributions/starter-gpu/run.yaml b/llama_stack/distributions/starter-gpu/run.yaml
index 92483c78e..4764dc02c 100644
--- a/llama_stack/distributions/starter-gpu/run.yaml
+++ b/llama_stack/distributions/starter-gpu/run.yaml
@@ -219,8 +219,6 @@ providers:
     config:
       api_key: ${env.TAVILY_SEARCH_API_KEY:=}
       max_results: 3
-  - provider_id: rag-runtime
-    provider_type: inline::rag-runtime
   - provider_id: model-context-protocol
     provider_type: remote::model-context-protocol
   batches:
@@ -266,8 +264,6 @@ registered_resources:
   tool_groups:
   - toolgroup_id: builtin::websearch
     provider_id: tavily-search
-  - toolgroup_id: builtin::rag
-    provider_id: rag-runtime
 server:
   port: 8321
 telemetry:
diff --git a/llama_stack/distributions/starter/build.yaml b/llama_stack/distributions/starter/build.yaml
index baa80ef3e..e6cd3c688 100644
--- a/llama_stack/distributions/starter/build.yaml
+++ b/llama_stack/distributions/starter/build.yaml
@@ -49,7 +49,6 @@ distribution_spec:
     tool_runtime:
     - provider_type: remote::brave-search
     - provider_type: remote::tavily-search
-    - provider_type: inline::rag-runtime
     - provider_type: remote::model-context-protocol
     batches:
     - provider_type: inline::reference
diff --git a/llama_stack/distributions/starter/run.yaml b/llama_stack/distributions/starter/run.yaml
index 3b9d8f890..88358501e 100644
--- a/llama_stack/distributions/starter/run.yaml
+++ b/llama_stack/distributions/starter/run.yaml
@@ -216,8 +216,6 @@ providers:
     config:
       api_key: ${env.TAVILY_SEARCH_API_KEY:=}
       max_results: 3
-  - provider_id: rag-runtime
-    provider_type: inline::rag-runtime
   - provider_id: model-context-protocol
     provider_type: remote::model-context-protocol
   batches:
@@ -263,8 +261,6 @@ registered_resources:
   tool_groups:
   - toolgroup_id: builtin::websearch
     provider_id: tavily-search
-  - toolgroup_id: builtin::rag
-    provider_id: rag-runtime
 server:
   port: 8321
 telemetry:
diff --git a/llama_stack/distributions/starter/starter.py b/llama_stack/distributions/starter/starter.py
index c8c7101a6..bad6279bd 100644
--- a/llama_stack/distributions/starter/starter.py
+++ b/llama_stack/distributions/starter/starter.py
@@ -140,7 +140,6 @@ def get_distribution_template(name: str = "starter") -> DistributionTemplate:
         "tool_runtime": [
             BuildProvider(provider_type="remote::brave-search"),
             BuildProvider(provider_type="remote::tavily-search"),
-            BuildProvider(provider_type="inline::rag-runtime"),
             BuildProvider(provider_type="remote::model-context-protocol"),
         ],
         "batches": [
@@ -162,10 +161,6 @@ def get_distribution_template(name: str = "starter") -> DistributionTemplate:
             toolgroup_id="builtin::websearch",
             provider_id="tavily-search",
         ),
-        ToolGroupInput(
-            toolgroup_id="builtin::rag",
-            provider_id="rag-runtime",
-        ),
     ]
     default_shields = [
         # if the
diff --git a/llama_stack/distributions/watsonx/build.yaml b/llama_stack/distributions/watsonx/build.yaml
index dba1a94e2..d2c396085 100644
--- a/llama_stack/distributions/watsonx/build.yaml
+++ b/llama_stack/distributions/watsonx/build.yaml
@@ -23,7 +23,6 @@ distribution_spec:
     tool_runtime:
     - provider_type: remote::brave-search
     - provider_type: remote::tavily-search
-    - provider_type: inline::rag-runtime
     - provider_type: remote::model-context-protocol
     files:
     - provider_type: inline::localfs
diff --git a/llama_stack/distributions/watsonx/run.yaml b/llama_stack/distributions/watsonx/run.yaml
index ca3c8402d..ddc7e095f 100644
--- a/llama_stack/distributions/watsonx/run.yaml
+++ b/llama_stack/distributions/watsonx/run.yaml
@@ -83,8 +83,6 @@ providers:
     config:
       api_key: ${env.TAVILY_SEARCH_API_KEY:=}
       max_results: 3
-  - provider_id: rag-runtime
-    provider_type: inline::rag-runtime
   - provider_id: model-context-protocol
     provider_type: remote::model-context-protocol
   files:
@@ -125,8 +123,6 @@ registered_resources:
   tool_groups:
   - toolgroup_id: builtin::websearch
     provider_id: tavily-search
-  - toolgroup_id: builtin::rag
-    provider_id: rag-runtime
 server:
   port: 8321
 telemetry:
diff --git a/llama_stack/distributions/watsonx/watsonx.py b/llama_stack/distributions/watsonx/watsonx.py
index d79aea872..b16f76fcb 100644
--- a/llama_stack/distributions/watsonx/watsonx.py
+++ b/llama_stack/distributions/watsonx/watsonx.py
@@ -33,7 +33,6 @@ def get_distribution_template(name: str = "watsonx") -> DistributionTemplate:
         "tool_runtime": [
             BuildProvider(provider_type="remote::brave-search"),
             BuildProvider(provider_type="remote::tavily-search"),
-            BuildProvider(provider_type="inline::rag-runtime"),
             BuildProvider(provider_type="remote::model-context-protocol"),
         ],
         "files": [BuildProvider(provider_type="inline::localfs")],
@@ -50,10 +49,6 @@ def get_distribution_template(name: str = "watsonx") -> DistributionTemplate:
             toolgroup_id="builtin::websearch",
             provider_id="tavily-search",
         ),
-        ToolGroupInput(
-            toolgroup_id="builtin::rag",
-            provider_id="rag-runtime",
-        ),
     ]
 
     files_provider = Provider(
diff --git a/llama_stack/providers/inline/tool_runtime/__init__.py b/llama_stack/providers/inline/tool_runtime/__init__.py
deleted file mode 100644
index 756f351d8..000000000
--- a/llama_stack/providers/inline/tool_runtime/__init__.py
+++ /dev/null
@@ -1,5 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
diff --git a/llama_stack/providers/inline/tool_runtime/rag/__init__.py b/llama_stack/providers/inline/tool_runtime/rag/__init__.py
deleted file mode 100644
index f9a7e7b89..000000000
--- a/llama_stack/providers/inline/tool_runtime/rag/__init__.py
+++ /dev/null
@@ -1,19 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from typing import Any
-
-from llama_stack.providers.datatypes import Api
-
-from .config import RagToolRuntimeConfig
-
-
-async def get_provider_impl(config: RagToolRuntimeConfig, deps: dict[Api, Any]):
-    from .memory import MemoryToolRuntimeImpl
-
-    impl = MemoryToolRuntimeImpl(config, deps[Api.vector_io], deps[Api.inference], deps[Api.files])
-    await impl.initialize()
-    return impl
diff --git a/llama_stack/providers/inline/tool_runtime/rag/config.py b/llama_stack/providers/inline/tool_runtime/rag/config.py
deleted file mode 100644
index 43ba78e65..000000000
--- a/llama_stack/providers/inline/tool_runtime/rag/config.py
+++ /dev/null
@@ -1,15 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from typing import Any
-
-from pydantic import BaseModel
-
-
-class RagToolRuntimeConfig(BaseModel):
-    @classmethod
-    def sample_run_config(cls, __distro_dir__: str, **kwargs: Any) -> dict[str, Any]:
-        return {}
diff --git a/llama_stack/providers/inline/tool_runtime/rag/context_retriever.py b/llama_stack/providers/inline/tool_runtime/rag/context_retriever.py
deleted file mode 100644
index 14cbec49d..000000000
--- a/llama_stack/providers/inline/tool_runtime/rag/context_retriever.py
+++ /dev/null
@@ -1,77 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-
-from jinja2 import Template
-
-from llama_stack.apis.common.content_types import InterleavedContent
-from llama_stack.apis.inference import OpenAIChatCompletionRequestWithExtraBody, OpenAIUserMessageParam
-from llama_stack.apis.tools.rag_tool import (
-    DefaultRAGQueryGeneratorConfig,
-    LLMRAGQueryGeneratorConfig,
-    RAGQueryGenerator,
-    RAGQueryGeneratorConfig,
-)
-from llama_stack.providers.utils.inference.prompt_adapter import (
-    interleaved_content_as_str,
-)
-
-
-async def generate_rag_query(
-    config: RAGQueryGeneratorConfig,
-    content: InterleavedContent,
-    **kwargs,
-):
-    """
-    Generates a query that will be used for
-    retrieving relevant information from the memory bank.
-    """
-    if config.type == RAGQueryGenerator.default.value:
-        query = await default_rag_query_generator(config, content, **kwargs)
-    elif config.type == RAGQueryGenerator.llm.value:
-        query = await llm_rag_query_generator(config, content, **kwargs)
-    else:
-        raise NotImplementedError(f"Unsupported memory query generator {config.type}")
-    return query
-
-
-async def default_rag_query_generator(
-    config: DefaultRAGQueryGeneratorConfig,
-    content: InterleavedContent,
-    **kwargs,
-):
-    return interleaved_content_as_str(content, sep=config.separator)
-
-
-async def llm_rag_query_generator(
-    config: LLMRAGQueryGeneratorConfig,
-    content: InterleavedContent,
-    **kwargs,
-):
-    assert "inference_api" in kwargs, "LLMRAGQueryGenerator needs inference_api"
-    inference_api = kwargs["inference_api"]
-
-    messages = []
-    if isinstance(content, list):
-        messages = [interleaved_content_as_str(m) for m in content]
-    else:
-        messages = [interleaved_content_as_str(content)]
-
-    template = Template(config.template)
-    rendered_content: str = template.render({"messages": messages})
-
-    model = config.model
-    message = OpenAIUserMessageParam(content=rendered_content)
-    params = OpenAIChatCompletionRequestWithExtraBody(
-        model=model,
-        messages=[message],
-        stream=False,
-    )
-    response = await inference_api.openai_chat_completion(params)
-
-    query = response.choices[0].message.content
-
-    return query
diff --git a/llama_stack/providers/inline/tool_runtime/rag/memory.py b/llama_stack/providers/inline/tool_runtime/rag/memory.py
deleted file mode 100644
index dc3dfbbca..000000000
--- a/llama_stack/providers/inline/tool_runtime/rag/memory.py
+++ /dev/null
@@ -1,332 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-import asyncio
-import base64
-import io
-import mimetypes
-from typing import Any
-
-import httpx
-from fastapi import UploadFile
-from pydantic import TypeAdapter
-
-from llama_stack.apis.common.content_types import (
-    URL,
-    InterleavedContent,
-    InterleavedContentItem,
-    TextContentItem,
-)
-from llama_stack.apis.files import Files, OpenAIFilePurpose
-from llama_stack.apis.inference import Inference
-from llama_stack.apis.tools import (
-    ListToolDefsResponse,
-    RAGDocument,
-    RAGQueryConfig,
-    RAGQueryResult,
-    RAGToolRuntime,
-    ToolDef,
-    ToolGroup,
-    ToolInvocationResult,
-    ToolRuntime,
-)
-from llama_stack.apis.vector_io import (
-    QueryChunksResponse,
-    VectorIO,
-    VectorStoreChunkingStrategyStatic,
-    VectorStoreChunkingStrategyStaticConfig,
-)
-from llama_stack.log import get_logger
-from llama_stack.providers.datatypes import ToolGroupsProtocolPrivate
-from llama_stack.providers.utils.inference.prompt_adapter import interleaved_content_as_str
-from llama_stack.providers.utils.memory.vector_store import parse_data_url
-
-from .config import RagToolRuntimeConfig
-from .context_retriever import generate_rag_query
-
-log = get_logger(name=__name__, category="tool_runtime")
-
-
-async def raw_data_from_doc(doc: RAGDocument) -> tuple[bytes, str]:
-    """Get raw binary data and mime type from a RAGDocument for file upload."""
-    if isinstance(doc.content, URL):
-        if doc.content.uri.startswith("data:"):
-            parts = parse_data_url(doc.content.uri)
-            mime_type = parts["mimetype"]
-            data = parts["data"]
-
-            if parts["is_base64"]:
-                file_data = base64.b64decode(data)
-            else:
-                file_data = data.encode("utf-8")
-
-            return file_data, mime_type
-        else:
-            async with httpx.AsyncClient() as client:
-                r = await client.get(doc.content.uri)
-                r.raise_for_status()
-                mime_type = r.headers.get("content-type", "application/octet-stream")
-                return r.content, mime_type
-    else:
-        if isinstance(doc.content, str):
-            content_str = doc.content
-        else:
-            content_str = interleaved_content_as_str(doc.content)
-
-        if content_str.startswith("data:"):
-            parts = parse_data_url(content_str)
-            mime_type = parts["mimetype"]
-            data = parts["data"]
-
-            if parts["is_base64"]:
-                file_data = base64.b64decode(data)
-            else:
-                file_data = data.encode("utf-8")
-
-            return file_data, mime_type
-        else:
-            return content_str.encode("utf-8"), "text/plain"
-
-
-class MemoryToolRuntimeImpl(ToolGroupsProtocolPrivate, ToolRuntime, RAGToolRuntime):
-    def __init__(
-        self,
-        config: RagToolRuntimeConfig,
-        vector_io_api: VectorIO,
-        inference_api: Inference,
-        files_api: Files,
-    ):
-        self.config = config
-        self.vector_io_api = vector_io_api
-        self.inference_api = inference_api
-        self.files_api = files_api
-
-    async def initialize(self):
-        pass
-
-    async def shutdown(self):
-        pass
-
-    async def register_toolgroup(self, toolgroup: ToolGroup) -> None:
-        pass
-
-    async def unregister_toolgroup(self, toolgroup_id: str) -> None:
-        return
-
-    async def insert(
-        self,
-        documents: list[RAGDocument],
-        vector_db_id: str,
-        chunk_size_in_tokens: int = 512,
-    ) -> None:
-        if not documents:
-            return
-
-        for doc in documents:
-            try:
-                try:
-                    file_data, mime_type = await raw_data_from_doc(doc)
-                except Exception as e:
-                    log.error(f"Failed to extract content from document {doc.document_id}: {e}")
-                    continue
-
-                file_extension = mimetypes.guess_extension(mime_type) or ".txt"
-                filename = doc.metadata.get("filename", f"{doc.document_id}{file_extension}")
-
-                file_obj = io.BytesIO(file_data)
-                file_obj.name = filename
-
-                upload_file = UploadFile(file=file_obj, filename=filename)
-
-                try:
-                    created_file = await self.files_api.openai_upload_file(
-                        file=upload_file, purpose=OpenAIFilePurpose.ASSISTANTS
-                    )
-                except Exception as e:
-                    log.error(f"Failed to upload file for document {doc.document_id}: {e}")
-                    continue
-
-                chunking_strategy = VectorStoreChunkingStrategyStatic(
-                    static=VectorStoreChunkingStrategyStaticConfig(
-                        max_chunk_size_tokens=chunk_size_in_tokens,
-                        chunk_overlap_tokens=chunk_size_in_tokens // 4,
-                    )
-                )
-
-                try:
-                    await self.vector_io_api.openai_attach_file_to_vector_store(
-                        vector_store_id=vector_db_id,
-                        file_id=created_file.id,
-                        attributes=doc.metadata,
-                        chunking_strategy=chunking_strategy,
-                    )
-                except Exception as e:
-                    log.error(
-                        f"Failed to attach file {created_file.id} to vector store {vector_db_id} for document {doc.document_id}: {e}"
-                    )
-                    continue
-
-            except Exception as e:
-                log.error(f"Unexpected error processing document {doc.document_id}: {e}")
-                continue
-
-    async def query(
-        self,
-        content: InterleavedContent,
-        vector_db_ids: list[str],
-        query_config: RAGQueryConfig | None = None,
-    ) -> RAGQueryResult:
-        if not vector_db_ids:
-            raise ValueError(
-                "No vector DBs were provided to the knowledge search tool. Please provide at least one vector DB ID."
-            )
-
-        query_config = query_config or RAGQueryConfig()
-        query = await generate_rag_query(
-            query_config.query_generator_config,
-            content,
-            inference_api=self.inference_api,
-        )
-        tasks = [
-            self.vector_io_api.query_chunks(
-                vector_db_id=vector_db_id,
-                query=query,
-                params={
-                    "mode": query_config.mode,
-                    "max_chunks": query_config.max_chunks,
-                    "score_threshold": 0.0,
-                    "ranker": query_config.ranker,
-                },
-            )
-            for vector_db_id in vector_db_ids
-        ]
-        results: list[QueryChunksResponse] = await asyncio.gather(*tasks)
-
-        chunks = []
-        scores = []
-
-        for vector_db_id, result in zip(vector_db_ids, results, strict=False):
-            for chunk, score in zip(result.chunks, result.scores, strict=False):
-                if not hasattr(chunk, "metadata") or chunk.metadata is None:
-                    chunk.metadata = {}
-                chunk.metadata["vector_db_id"] = vector_db_id
-
-                chunks.append(chunk)
-                scores.append(score)
-
-        if not chunks:
-            return RAGQueryResult(content=None)
-
-        # sort by score
-        chunks, scores = zip(*sorted(zip(chunks, scores, strict=False), key=lambda x: x[1], reverse=True), strict=False)  # type: ignore
-        chunks = chunks[: query_config.max_chunks]
-
-        tokens = 0
-        picked: list[InterleavedContentItem] = [
-            TextContentItem(
-                text=f"knowledge_search tool found {len(chunks)} chunks:\nBEGIN of knowledge_search tool results.\n"
-            )
-        ]
-        for i, chunk in enumerate(chunks):
-            metadata = chunk.metadata
-            tokens += metadata.get("token_count", 0)
-            tokens += metadata.get("metadata_token_count", 0)
-
-            if tokens > query_config.max_tokens_in_context:
-                log.error(
-                    f"Using {len(picked)} chunks; reached max tokens in context: {tokens}",
-                )
-                break
-
-            # Add useful keys from chunk_metadata to metadata and remove some from metadata
-            chunk_metadata_keys_to_include_from_context = [
-                "chunk_id",
-                "document_id",
-                "source",
-            ]
-            metadata_keys_to_exclude_from_context = [
-                "token_count",
-                "metadata_token_count",
-                "vector_db_id",
-            ]
-            metadata_for_context = {}
-            for k in chunk_metadata_keys_to_include_from_context:
-                metadata_for_context[k] = getattr(chunk.chunk_metadata, k)
-            for k in metadata:
-                if k not in metadata_keys_to_exclude_from_context:
-                    metadata_for_context[k] = metadata[k]
-
-            text_content = query_config.chunk_template.format(index=i + 1, chunk=chunk, metadata=metadata_for_context)
-            picked.append(TextContentItem(text=text_content))
-
-        picked.append(TextContentItem(text="END of knowledge_search tool results.\n"))
-        picked.append(
-            TextContentItem(
-                text=f'The above results were retrieved to help answer the user\'s query: "{interleaved_content_as_str(content)}". Use them as supporting information only in answering this query.\n',
-            )
-        )
-
-        return RAGQueryResult(
-            content=picked,
-            metadata={
-                "document_ids": [c.document_id for c in chunks[: len(picked)]],
-                "chunks": [c.content for c in chunks[: len(picked)]],
-                "scores": scores[: len(picked)],
-                "vector_db_ids": [c.metadata["vector_db_id"] for c in chunks[: len(picked)]],
-            },
-        )
-
-    async def list_runtime_tools(
-        self, tool_group_id: str | None = None, mcp_endpoint: URL | None = None
-    ) -> ListToolDefsResponse:
-        # Parameters are not listed since these methods are not yet invoked automatically
-        # by the LLM. The method is only implemented so things like /tools can list without
-        # encountering fatals.
-        return ListToolDefsResponse(
-            data=[
-                ToolDef(
-                    name="insert_into_memory",
-                    description="Insert documents into memory",
-                ),
-                ToolDef(
-                    name="knowledge_search",
-                    description="Search for information in a database.",
-                    input_schema={
-                        "type": "object",
-                        "properties": {
-                            "query": {
-                                "type": "string",
-                                "description": "The query to search for. Can be a natural language sentence or keywords.",
-                            }
-                        },
-                        "required": ["query"],
-                    },
-                ),
-            ]
-        )
-
-    async def invoke_tool(self, tool_name: str, kwargs: dict[str, Any]) -> ToolInvocationResult:
-        vector_db_ids = kwargs.get("vector_db_ids", [])
-        query_config = kwargs.get("query_config")
-        if query_config:
-            query_config = TypeAdapter(RAGQueryConfig).validate_python(query_config)
-        else:
-            query_config = RAGQueryConfig()
-
-        query = kwargs["query"]
-        result = await self.query(
-            content=query,
-            vector_db_ids=vector_db_ids,
-            query_config=query_config,
-        )
-
-        return ToolInvocationResult(
-            content=result.content or [],
-            metadata={
-                **(result.metadata or {}),
-                "citation_files": getattr(result, "citation_files", None),
-            },
-        )
diff --git a/llama_stack/providers/registry/inference.py b/llama_stack/providers/registry/inference.py
index 35afb296d..2e52e2d12 100644
--- a/llama_stack/providers/registry/inference.py
+++ b/llama_stack/providers/registry/inference.py
@@ -42,6 +42,7 @@ def available_providers() -> list[ProviderSpec]:
             # CrossEncoder depends on torchao.quantization
             pip_packages=[
                 "torch torchvision torchao>=0.12.0 --extra-index-url https://download.pytorch.org/whl/cpu",
+                "numpy tqdm transformers",
                 "sentence-transformers --no-deps",
                 # required by some SentenceTransformers architectures for tensor rearrange/merge ops
                 "einops",
diff --git a/llama_stack/providers/registry/tool_runtime.py b/llama_stack/providers/registry/tool_runtime.py
index 39dc7fccd..514d9d0a0 100644
--- a/llama_stack/providers/registry/tool_runtime.py
+++ b/llama_stack/providers/registry/tool_runtime.py
@@ -7,33 +7,13 @@
 
 from llama_stack.providers.datatypes import (
     Api,
-    InlineProviderSpec,
     ProviderSpec,
     RemoteProviderSpec,
 )
-from llama_stack.providers.registry.vector_io import DEFAULT_VECTOR_IO_DEPS
 
 
 def available_providers() -> list[ProviderSpec]:
     return [
-        InlineProviderSpec(
-            api=Api.tool_runtime,
-            provider_type="inline::rag-runtime",
-            pip_packages=DEFAULT_VECTOR_IO_DEPS
-            + [
-                "tqdm",
-                "numpy",
-                "scikit-learn",
-                "scipy",
-                "nltk",
-                "sentencepiece",
-                "transformers",
-            ],
-            module="llama_stack.providers.inline.tool_runtime.rag",
-            config_class="llama_stack.providers.inline.tool_runtime.rag.config.RagToolRuntimeConfig",
-            api_dependencies=[Api.vector_io, Api.inference, Api.files],
-            description="RAG (Retrieval-Augmented Generation) tool runtime for document ingestion, chunking, and semantic search.",
-        ),
         RemoteProviderSpec(
             api=Api.tool_runtime,
             adapter_type="brave-search",
diff --git a/llama_stack/providers/registry/vector_io.py b/llama_stack/providers/registry/vector_io.py
index ff3b8486f..db81ea35d 100644
--- a/llama_stack/providers/registry/vector_io.py
+++ b/llama_stack/providers/registry/vector_io.py
@@ -119,7 +119,7 @@ Datasets that can fit in memory, frequent reads | Faiss | Optimized for speed, i
 #### Empirical Example
 
 Consider the histogram below in which 10,000 randomly generated strings were inserted
-in batches of 100 into both Faiss and sqlite-vec using `client.tool_runtime.rag_tool.insert()`.
+in batches of 100 into both Faiss and sqlite-vec.
 
 ```{image} ../../../../_static/providers/vector_io/write_time_comparison_sqlite-vec-faiss.png
 :alt: Comparison of SQLite-Vec and Faiss write times
diff --git a/llama_stack/providers/utils/memory/vector_store.py b/llama_stack/providers/utils/memory/vector_store.py
index 6c8746e92..9e9c9a08a 100644
--- a/llama_stack/providers/utils/memory/vector_store.py
+++ b/llama_stack/providers/utils/memory/vector_store.py
@@ -12,17 +12,14 @@ from dataclasses import dataclass
 from typing import Any
 from urllib.parse import unquote
 
-import httpx
 import numpy as np
 from numpy.typing import NDArray
 from pydantic import BaseModel
 
 from llama_stack.apis.common.content_types import (
-    URL,
     InterleavedContent,
 )
 from llama_stack.apis.inference import OpenAIEmbeddingsRequestWithExtraBody
-from llama_stack.apis.tools import RAGDocument
 from llama_stack.apis.vector_io import Chunk, ChunkMetadata, QueryChunksResponse
 from llama_stack.apis.vector_stores import VectorStore
 from llama_stack.log import get_logger
@@ -129,31 +126,6 @@ def content_from_data_and_mime_type(data: bytes | str, mime_type: str | None, en
         return ""
 
 
-async def content_from_doc(doc: RAGDocument) -> str:
-    if isinstance(doc.content, URL):
-        if doc.content.uri.startswith("data:"):
-            return content_from_data(doc.content.uri)
-        async with httpx.AsyncClient() as client:
-            r = await client.get(doc.content.uri)
-        if doc.mime_type == "application/pdf":
-            return parse_pdf(r.content)
-        return r.text
-    elif isinstance(doc.content, str):
-        pattern = re.compile("^(https?://|file://|data:)")
-        if pattern.match(doc.content):
-            if doc.content.startswith("data:"):
-                return content_from_data(doc.content)
-            async with httpx.AsyncClient() as client:
-                r = await client.get(doc.content)
-            if doc.mime_type == "application/pdf":
-                return parse_pdf(r.content)
-            return r.text
-        return doc.content
-    else:
-        # will raise ValueError if the content is not List[InterleavedContent] or InterleavedContent
-        return interleaved_content_as_str(doc.content)
-
-
 def make_overlapped_chunks(
     document_id: str, text: str, window_len: int, overlap_len: int, metadata: dict[str, Any]
 ) -> list[Chunk]:
diff --git a/tests/unit/providers/utils/memory/test_vector_store.py b/tests/unit/providers/utils/memory/test_vector_store.py
index 590bdd1d2..3a5cd5bf7 100644
--- a/tests/unit/providers/utils/memory/test_vector_store.py
+++ b/tests/unit/providers/utils/memory/test_vector_store.py
@@ -4,138 +4,11 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
-from unittest.mock import AsyncMock, MagicMock, patch
+from unittest.mock import patch
 
 import pytest
 
-from llama_stack.apis.common.content_types import URL, TextContentItem
-from llama_stack.apis.tools import RAGDocument
-from llama_stack.providers.utils.memory.vector_store import content_from_data_and_mime_type, content_from_doc
-
-
-async def test_content_from_doc_with_url():
-    """Test extracting content from RAGDocument with URL content."""
-    mock_url = URL(uri="https://example.com")
-    mock_doc = RAGDocument(document_id="foo", content=mock_url)
-
-    mock_response = MagicMock()
-    mock_response.text = "Sample content from URL"
-
-    with patch("httpx.AsyncClient") as mock_client:
-        mock_instance = AsyncMock()
-        mock_instance.get.return_value = mock_response
-        mock_client.return_value.__aenter__.return_value = mock_instance
-
-        result = await content_from_doc(mock_doc)
-
-        assert result == "Sample content from URL"
-        mock_instance.get.assert_called_once_with(mock_url.uri)
-
-
-async def test_content_from_doc_with_pdf_url():
-    """Test extracting content from RAGDocument with URL pointing to a PDF."""
-    mock_url = URL(uri="https://example.com/document.pdf")
-    mock_doc = RAGDocument(document_id="foo", content=mock_url, mime_type="application/pdf")
-
-    mock_response = MagicMock()
-    mock_response.content = b"PDF binary data"
-
-    with (
-        patch("httpx.AsyncClient") as mock_client,
-        patch("llama_stack.providers.utils.memory.vector_store.parse_pdf") as mock_parse_pdf,
-    ):
-        mock_instance = AsyncMock()
-        mock_instance.get.return_value = mock_response
-        mock_client.return_value.__aenter__.return_value = mock_instance
-        mock_parse_pdf.return_value = "Extracted PDF content"
-
-        result = await content_from_doc(mock_doc)
-
-        assert result == "Extracted PDF content"
-        mock_instance.get.assert_called_once_with(mock_url.uri)
-        mock_parse_pdf.assert_called_once_with(b"PDF binary data")
-
-
-async def test_content_from_doc_with_data_url():
-    """Test extracting content from RAGDocument with data URL content."""
-    data_url = "data:text/plain;base64,SGVsbG8gV29ybGQ="  # "Hello World" base64 encoded
-    mock_url = URL(uri=data_url)
-    mock_doc = RAGDocument(document_id="foo", content=mock_url)
-
-    with patch("llama_stack.providers.utils.memory.vector_store.content_from_data") as mock_content_from_data:
-        mock_content_from_data.return_value = "Hello World"
-
-        result = await content_from_doc(mock_doc)
-
-        assert result == "Hello World"
-        mock_content_from_data.assert_called_once_with(data_url)
-
-
-async def test_content_from_doc_with_string():
-    """Test extracting content from RAGDocument with string content."""
-    content_string = "This is plain text content"
-    mock_doc = RAGDocument(document_id="foo", content=content_string)
-
-    result = await content_from_doc(mock_doc)
-
-    assert result == content_string
-
-
-async def test_content_from_doc_with_string_url():
-    """Test extracting content from RAGDocument with string URL content."""
-    url_string = "https://example.com"
-    mock_doc = RAGDocument(document_id="foo", content=url_string)
-
-    mock_response = MagicMock()
-    mock_response.text = "Sample content from URL string"
-
-    with patch("httpx.AsyncClient") as mock_client:
-        mock_instance = AsyncMock()
-        mock_instance.get.return_value = mock_response
-        mock_client.return_value.__aenter__.return_value = mock_instance
-
-        result = await content_from_doc(mock_doc)
-
-        assert result == "Sample content from URL string"
-        mock_instance.get.assert_called_once_with(url_string)
-
-
-async def test_content_from_doc_with_string_pdf_url():
-    """Test extracting content from RAGDocument with string URL pointing to a PDF."""
-    url_string = "https://example.com/document.pdf"
-    mock_doc = RAGDocument(document_id="foo", content=url_string, mime_type="application/pdf")
-
-    mock_response = MagicMock()
-    mock_response.content = b"PDF binary data"
-
-    with (
-        patch("httpx.AsyncClient") as mock_client,
-        patch("llama_stack.providers.utils.memory.vector_store.parse_pdf") as mock_parse_pdf,
-    ):
-        mock_instance = AsyncMock()
-        mock_instance.get.return_value = mock_response
-        mock_client.return_value.__aenter__.return_value = mock_instance
-        mock_parse_pdf.return_value = "Extracted PDF content from string URL"
-
-        result = await content_from_doc(mock_doc)
-
-        assert result == "Extracted PDF content from string URL"
-        mock_instance.get.assert_called_once_with(url_string)
-        mock_parse_pdf.assert_called_once_with(b"PDF binary data")
-
-
-async def test_content_from_doc_with_interleaved_content():
-    """Test extracting content from RAGDocument with InterleavedContent (the new case added in the commit)."""
-    interleaved_content = [TextContentItem(text="First item"), TextContentItem(text="Second item")]
-    mock_doc = RAGDocument(document_id="foo", content=interleaved_content)
-
-    with patch("llama_stack.providers.utils.memory.vector_store.interleaved_content_as_str") as mock_interleaved:
-        mock_interleaved.return_value = "First item\nSecond item"
-
-        result = await content_from_doc(mock_doc)
-
-        assert result == "First item\nSecond item"
-        mock_interleaved.assert_called_once_with(interleaved_content)
+from llama_stack.providers.utils.memory.vector_store import content_from_data_and_mime_type
 
 
 def test_content_from_data_and_mime_type_success_utf8():
@@ -178,41 +51,3 @@ def test_content_from_data_and_mime_type_both_encodings_fail():
         # Should raise an exception instead of returning empty string
         with pytest.raises(UnicodeDecodeError):
             content_from_data_and_mime_type(data, mime_type)
-
-
-async def test_memory_tool_error_handling():
-    """Test that memory tool handles various failures gracefully without crashing."""
-    from llama_stack.providers.inline.tool_runtime.rag.config import RagToolRuntimeConfig
-    from llama_stack.providers.inline.tool_runtime.rag.memory import MemoryToolRuntimeImpl
-
-    config = RagToolRuntimeConfig()
-    memory_tool = MemoryToolRuntimeImpl(
-        config=config,
-        vector_io_api=AsyncMock(),
-        inference_api=AsyncMock(),
-        files_api=AsyncMock(),
-    )
-
-    docs = [
-        RAGDocument(document_id="good_doc", content="Good content", metadata={}),
-        RAGDocument(document_id="bad_url_doc", content=URL(uri="https://bad.url"), metadata={}),
-        RAGDocument(document_id="another_good_doc", content="Another good content", metadata={}),
-    ]
-
-    mock_file1 = MagicMock()
-    mock_file1.id = "file_good1"
-    mock_file2 = MagicMock()
-    mock_file2.id = "file_good2"
-    memory_tool.files_api.openai_upload_file.side_effect = [mock_file1, mock_file2]
-
-    with patch("httpx.AsyncClient") as mock_client:
-        mock_instance = AsyncMock()
-        mock_instance.get.side_effect = Exception("Bad URL")
-        mock_client.return_value.__aenter__.return_value = mock_instance
-
-        # won't raise exception despite one document failing
-        await memory_tool.insert(docs, "vector_store_123")
-
-    # processed 2 documents successfully, skipped 1
-    assert memory_tool.files_api.openai_upload_file.call_count == 2
-    assert memory_tool.vector_io_api.openai_attach_file_to_vector_store.call_count == 2
diff --git a/tests/unit/rag/test_rag_query.py b/tests/unit/rag/test_rag_query.py
deleted file mode 100644
index c012bc4f0..000000000
--- a/tests/unit/rag/test_rag_query.py
+++ /dev/null
@@ -1,138 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from unittest.mock import AsyncMock, MagicMock
-
-import pytest
-
-from llama_stack.apis.tools.rag_tool import RAGQueryConfig
-from llama_stack.apis.vector_io import (
-    Chunk,
-    ChunkMetadata,
-    QueryChunksResponse,
-)
-from llama_stack.providers.inline.tool_runtime.rag.memory import MemoryToolRuntimeImpl
-
-
-class TestRagQuery:
-    async def test_query_raises_on_empty_vector_store_ids(self):
-        rag_tool = MemoryToolRuntimeImpl(
-            config=MagicMock(), vector_io_api=MagicMock(), inference_api=MagicMock(), files_api=MagicMock()
-        )
-        with pytest.raises(ValueError):
-            await rag_tool.query(content=MagicMock(), vector_db_ids=[])
-
-    async def test_query_chunk_metadata_handling(self):
-        rag_tool = MemoryToolRuntimeImpl(
-            config=MagicMock(), vector_io_api=MagicMock(), inference_api=MagicMock(), files_api=MagicMock()
-        )
-        content = "test query content"
-        vector_db_ids = ["db1"]
-
-        chunk_metadata = ChunkMetadata(
-            document_id="doc1",
-            chunk_id="chunk1",
-            source="test_source",
-            metadata_token_count=5,
-        )
-        interleaved_content = MagicMock()
-        chunk = Chunk(
-            content=interleaved_content,
-            metadata={
-                "key1": "value1",
-                "token_count": 10,
-                "metadata_token_count": 5,
-                # Note this is inserted into `metadata` during MemoryToolRuntimeImpl().insert()
-                "document_id": "doc1",
-            },
-            stored_chunk_id="chunk1",
-            chunk_metadata=chunk_metadata,
-        )
-
-        query_response = QueryChunksResponse(chunks=[chunk], scores=[1.0])
-
-        rag_tool.vector_io_api.query_chunks = AsyncMock(return_value=query_response)
-        result = await rag_tool.query(content=content, vector_db_ids=vector_db_ids)
-
-        assert result is not None
-        expected_metadata_string = (
-            "Metadata: {'chunk_id': 'chunk1', 'document_id': 'doc1', 'source': 'test_source', 'key1': 'value1'}"
-        )
-        assert expected_metadata_string in result.content[1].text
-        assert result.content is not None
-
-    async def test_query_raises_incorrect_mode(self):
-        with pytest.raises(ValueError):
-            RAGQueryConfig(mode="invalid_mode")
-
-    async def test_query_accepts_valid_modes(self):
-        default_config = RAGQueryConfig()  # Test default (vector)
-        assert default_config.mode == "vector"
-        vector_config = RAGQueryConfig(mode="vector")  # Test vector
-        assert vector_config.mode == "vector"
-        keyword_config = RAGQueryConfig(mode="keyword")  # Test keyword
-        assert keyword_config.mode == "keyword"
-        hybrid_config = RAGQueryConfig(mode="hybrid")  # Test hybrid
-        assert hybrid_config.mode == "hybrid"
-
-        # Test that invalid mode raises an error
-        with pytest.raises(ValueError):
-            RAGQueryConfig(mode="wrong_mode")
-
-    async def test_query_adds_vector_store_id_to_chunk_metadata(self):
-        rag_tool = MemoryToolRuntimeImpl(
-            config=MagicMock(),
-            vector_io_api=MagicMock(),
-            inference_api=MagicMock(),
-            files_api=MagicMock(),
-        )
-
-        vector_db_ids = ["db1", "db2"]
-
-        # Fake chunks from each DB
-        chunk_metadata1 = ChunkMetadata(
-            document_id="doc1",
-            chunk_id="chunk1",
-            source="test_source1",
-            metadata_token_count=5,
-        )
-        chunk1 = Chunk(
-            content="chunk from db1",
-            metadata={"vector_db_id": "db1", "document_id": "doc1"},
-            stored_chunk_id="c1",
-            chunk_metadata=chunk_metadata1,
-        )
-
-        chunk_metadata2 = ChunkMetadata(
-            document_id="doc2",
-            chunk_id="chunk2",
-            source="test_source2",
-            metadata_token_count=5,
-        )
-        chunk2 = Chunk(
-            content="chunk from db2",
-            metadata={"vector_db_id": "db2", "document_id": "doc2"},
-            stored_chunk_id="c2",
-            chunk_metadata=chunk_metadata2,
-        )
-
-        rag_tool.vector_io_api.query_chunks = AsyncMock(
-            side_effect=[
-                QueryChunksResponse(chunks=[chunk1], scores=[0.9]),
-                QueryChunksResponse(chunks=[chunk2], scores=[0.8]),
-            ]
-        )
-
-        result = await rag_tool.query(content="test", vector_db_ids=vector_db_ids)
-        returned_chunks = result.metadata["chunks"]
-        returned_scores = result.metadata["scores"]
-        returned_doc_ids = result.metadata["document_ids"]
-        returned_vector_db_ids = result.metadata["vector_db_ids"]
-
-        assert returned_chunks == ["chunk from db1", "chunk from db2"]
-        assert returned_scores == (0.9, 0.8)
-        assert returned_doc_ids == ["doc1", "doc2"]
-        assert returned_vector_db_ids == ["db1", "db2"]
diff --git a/tests/unit/rag/test_vector_store.py b/tests/unit/rag/test_vector_store.py
index 200da5c26..e185b83e7 100644
--- a/tests/unit/rag/test_vector_store.py
+++ b/tests/unit/rag/test_vector_store.py
@@ -4,10 +4,6 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
-import base64
-import mimetypes
-import os
-from pathlib import Path
 from unittest.mock import AsyncMock, MagicMock
 
 import numpy as np
@@ -17,37 +13,13 @@ from llama_stack.apis.inference.inference import (
     OpenAIEmbeddingData,
     OpenAIEmbeddingsRequestWithExtraBody,
 )
-from llama_stack.apis.tools import RAGDocument
 from llama_stack.apis.vector_io import Chunk
 from llama_stack.providers.utils.memory.vector_store import (
-    URL,
     VectorStoreWithIndex,
     _validate_embedding,
-    content_from_doc,
     make_overlapped_chunks,
 )
 
-DUMMY_PDF_PATH = Path(os.path.abspath(__file__)).parent / "fixtures" / "dummy.pdf"
-# Depending on the machine, this can get parsed a couple of ways
-DUMMY_PDF_TEXT_CHOICES = ["Dummy PDF file", "Dumm y PDF file"]
-
-
-def read_file(file_path: str) -> bytes:
-    with open(file_path, "rb") as file:
-        return file.read()
-
-
-def data_url_from_file(file_path: str) -> str:
-    with open(file_path, "rb") as file:
-        file_content = file.read()
-
-    base64_content = base64.b64encode(file_content).decode("utf-8")
-    mime_type, _ = mimetypes.guess_type(file_path)
-
-    data_url = f"data:{mime_type};base64,{base64_content}"
-
-    return data_url
-
 
 class TestChunk:
     def test_chunk(self):
@@ -116,45 +88,6 @@ class TestValidateEmbedding:
 
 
 class TestVectorStore:
-    async def test_returns_content_from_pdf_data_uri(self):
-        data_uri = data_url_from_file(DUMMY_PDF_PATH)
-        doc = RAGDocument(
-            document_id="dummy",
-            content=data_uri,
-            mime_type="application/pdf",
-            metadata={},
-        )
-        content = await content_from_doc(doc)
-        assert content in DUMMY_PDF_TEXT_CHOICES
-
-    @pytest.mark.allow_network
-    async def test_downloads_pdf_and_returns_content(self):
-        # Using GitHub to host the PDF file
-        url = "https://raw.githubusercontent.com/meta-llama/llama-stack/da035d69cfca915318eaf485770a467ca3c2a238/llama_stack/providers/tests/memory/fixtures/dummy.pdf"
-        doc = RAGDocument(
-            document_id="dummy",
-            content=url,
-            mime_type="application/pdf",
-            metadata={},
-        )
-        content = await content_from_doc(doc)
-        assert content in DUMMY_PDF_TEXT_CHOICES
-
-    @pytest.mark.allow_network
-    async def test_downloads_pdf_and_returns_content_with_url_object(self):
-        # Using GitHub to host the PDF file
-        url = "https://raw.githubusercontent.com/meta-llama/llama-stack/da035d69cfca915318eaf485770a467ca3c2a238/llama_stack/providers/tests/memory/fixtures/dummy.pdf"
-        doc = RAGDocument(
-            document_id="dummy",
-            content=URL(
-                uri=url,
-            ),
-            mime_type="application/pdf",
-            metadata={},
-        )
-        content = await content_from_doc(doc)
-        assert content in DUMMY_PDF_TEXT_CHOICES
-
     @pytest.mark.parametrize(
         "window_len, overlap_len, expected_chunks",
         [