chore(cleanup)!: remove tool_runtime.rag_tool (#3871)

Kill the `builtin::rag` tool group completely since it is no longer targeted. We use the Responses implementation for knowledge_search which uses the `openai_vector_stores` pathway. --------- Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>
2025-12-08 03:00:56 +00:00 · 2025-10-20 22:26:21 -07:00 · 2025-10-20 22:26:21 -07:00 · 0e96279bee
commit 0e96279bee
parent 5aaf1a8bca
55 changed files with 17 additions and 3114 deletions
--- a/llama_stack/providers/registry/inference.py
+++ b/llama_stack/providers/registry/inference.py
@ -42,6 +42,7 @@ def available_providers() -> list[ProviderSpec]:
            # CrossEncoder depends on torchao.quantization
            pip_packages=[
                "torch torchvision torchao>=0.12.0 --extra-index-url https://download.pytorch.org/whl/cpu",
+                "numpy tqdm transformers",
                "sentence-transformers --no-deps",
                # required by some SentenceTransformers architectures for tensor rearrange/merge ops
                "einops",
--- a/llama_stack/providers/registry/tool_runtime.py
+++ b/llama_stack/providers/registry/tool_runtime.py
@ -7,33 +7,13 @@

 from llama_stack.providers.datatypes import (
    Api,
-    InlineProviderSpec,
    ProviderSpec,
    RemoteProviderSpec,
 )
-from llama_stack.providers.registry.vector_io import DEFAULT_VECTOR_IO_DEPS


 def available_providers() -> list[ProviderSpec]:
    return [
-        InlineProviderSpec(
-            api=Api.tool_runtime,
-            provider_type="inline::rag-runtime",
-            pip_packages=DEFAULT_VECTOR_IO_DEPS
-            + [
-                "tqdm",
-                "numpy",
-                "scikit-learn",
-                "scipy",
-                "nltk",
-                "sentencepiece",
-                "transformers",
-            ],
-            module="llama_stack.providers.inline.tool_runtime.rag",
-            config_class="llama_stack.providers.inline.tool_runtime.rag.config.RagToolRuntimeConfig",
-            api_dependencies=[Api.vector_io, Api.inference, Api.files],
-            description="RAG (Retrieval-Augmented Generation) tool runtime for document ingestion, chunking, and semantic search.",
-        ),
        RemoteProviderSpec(
            api=Api.tool_runtime,
            adapter_type="brave-search",
--- a/llama_stack/providers/registry/vector_io.py
+++ b/llama_stack/providers/registry/vector_io.py
@ -119,7 +119,7 @@ Datasets that can fit in memory, frequent reads | Faiss | Optimized for speed, i
 #### Empirical Example

 Consider the histogram below in which 10,000 randomly generated strings were inserted
-in batches of 100 into both Faiss and sqlite-vec using `client.tool_runtime.rag_tool.insert()`.
+in batches of 100 into both Faiss and sqlite-vec.

 ```{image} ../../../../_static/providers/vector_io/write_time_comparison_sqlite-vec-faiss.png
 :alt: Comparison of SQLite-Vec and Faiss write times