diff --git a/docs/_static/llama-stack-spec.html b/docs/_static/llama-stack-spec.html
index 96de04ec9..aef066f11 100644
--- a/docs/_static/llama-stack-spec.html
+++ b/docs/_static/llama-stack-spec.html
@@ -7047,6 +7047,9 @@
{
"$ref": "#/components/schemas/OpenAIResponseOutputMessageWebSearchToolCall"
},
+ {
+ "$ref": "#/components/schemas/OpenAIResponseOutputMessageFileSearchToolCall"
+ },
{
"$ref": "#/components/schemas/OpenAIResponseOutputMessageFunctionToolCall"
},
@@ -7193,7 +7196,7 @@
"const": "file_search",
"default": "file_search"
},
- "vector_store_id": {
+ "vector_store_ids": {
"type": "array",
"items": {
"type": "string"
@@ -7217,7 +7220,7 @@
"additionalProperties": false,
"required": [
"type",
- "vector_store_id"
+ "vector_store_ids"
],
"title": "OpenAIResponseInputToolFileSearch"
},
@@ -7484,6 +7487,64 @@
],
"title": "OpenAIResponseOutputMessageContentOutputText"
},
+ "OpenAIResponseOutputMessageFileSearchToolCall": {
+ "type": "object",
+ "properties": {
+ "id": {
+ "type": "string"
+ },
+ "queries": {
+ "type": "array",
+ "items": {
+ "type": "string"
+ }
+ },
+ "status": {
+ "type": "string"
+ },
+ "type": {
+ "type": "string",
+ "const": "file_search_call",
+ "default": "file_search_call"
+ },
+ "results": {
+ "type": "array",
+ "items": {
+ "type": "object",
+ "additionalProperties": {
+ "oneOf": [
+ {
+ "type": "null"
+ },
+ {
+ "type": "boolean"
+ },
+ {
+ "type": "number"
+ },
+ {
+ "type": "string"
+ },
+ {
+ "type": "array"
+ },
+ {
+ "type": "object"
+ }
+ ]
+ }
+ }
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "id",
+ "queries",
+ "status",
+ "type"
+ ],
+ "title": "OpenAIResponseOutputMessageFileSearchToolCall"
+ },
"OpenAIResponseOutputMessageFunctionToolCall": {
"type": "object",
"properties": {
@@ -7760,6 +7821,9 @@
{
"$ref": "#/components/schemas/OpenAIResponseOutputMessageWebSearchToolCall"
},
+ {
+ "$ref": "#/components/schemas/OpenAIResponseOutputMessageFileSearchToolCall"
+ },
{
"$ref": "#/components/schemas/OpenAIResponseOutputMessageFunctionToolCall"
},
@@ -7775,6 +7839,7 @@
"mapping": {
"message": "#/components/schemas/OpenAIResponseMessage",
"web_search_call": "#/components/schemas/OpenAIResponseOutputMessageWebSearchToolCall",
+ "file_search_call": "#/components/schemas/OpenAIResponseOutputMessageFileSearchToolCall",
"function_call": "#/components/schemas/OpenAIResponseOutputMessageFunctionToolCall",
"mcp_call": "#/components/schemas/OpenAIResponseOutputMessageMCPCall",
"mcp_list_tools": "#/components/schemas/OpenAIResponseOutputMessageMCPListTools"
diff --git a/docs/_static/llama-stack-spec.yaml b/docs/_static/llama-stack-spec.yaml
index b2fe870be..4154a430d 100644
--- a/docs/_static/llama-stack-spec.yaml
+++ b/docs/_static/llama-stack-spec.yaml
@@ -5021,6 +5021,7 @@ components:
OpenAIResponseInput:
oneOf:
- $ref: '#/components/schemas/OpenAIResponseOutputMessageWebSearchToolCall'
+ - $ref: '#/components/schemas/OpenAIResponseOutputMessageFileSearchToolCall'
- $ref: '#/components/schemas/OpenAIResponseOutputMessageFunctionToolCall'
- $ref: '#/components/schemas/OpenAIResponseInputFunctionToolCallOutput'
- $ref: '#/components/schemas/OpenAIResponseMessage'
@@ -5115,7 +5116,7 @@ components:
type: string
const: file_search
default: file_search
- vector_store_id:
+ vector_store_ids:
type: array
items:
type: string
@@ -5132,7 +5133,7 @@ components:
additionalProperties: false
required:
- type
- - vector_store_id
+ - vector_store_ids
title: OpenAIResponseInputToolFileSearch
OpenAIResponseInputToolFunction:
type: object
@@ -5294,6 +5295,41 @@ components:
- type
title: >-
OpenAIResponseOutputMessageContentOutputText
+ "OpenAIResponseOutputMessageFileSearchToolCall":
+ type: object
+ properties:
+ id:
+ type: string
+ queries:
+ type: array
+ items:
+ type: string
+ status:
+ type: string
+ type:
+ type: string
+ const: file_search_call
+ default: file_search_call
+ results:
+ type: array
+ items:
+ type: object
+ additionalProperties:
+ oneOf:
+ - type: 'null'
+ - type: boolean
+ - type: number
+ - type: string
+ - type: array
+ - type: object
+ additionalProperties: false
+ required:
+ - id
+ - queries
+ - status
+ - type
+ title: >-
+ OpenAIResponseOutputMessageFileSearchToolCall
"OpenAIResponseOutputMessageFunctionToolCall":
type: object
properties:
@@ -5491,6 +5527,7 @@ components:
oneOf:
- $ref: '#/components/schemas/OpenAIResponseMessage'
- $ref: '#/components/schemas/OpenAIResponseOutputMessageWebSearchToolCall'
+ - $ref: '#/components/schemas/OpenAIResponseOutputMessageFileSearchToolCall'
- $ref: '#/components/schemas/OpenAIResponseOutputMessageFunctionToolCall'
- $ref: '#/components/schemas/OpenAIResponseOutputMessageMCPCall'
- $ref: '#/components/schemas/OpenAIResponseOutputMessageMCPListTools'
@@ -5499,6 +5536,7 @@ components:
mapping:
message: '#/components/schemas/OpenAIResponseMessage'
web_search_call: '#/components/schemas/OpenAIResponseOutputMessageWebSearchToolCall'
+ file_search_call: '#/components/schemas/OpenAIResponseOutputMessageFileSearchToolCall'
function_call: '#/components/schemas/OpenAIResponseOutputMessageFunctionToolCall'
mcp_call: '#/components/schemas/OpenAIResponseOutputMessageMCPCall'
mcp_list_tools: '#/components/schemas/OpenAIResponseOutputMessageMCPListTools'
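
With the request field renamed from vector_store_id to vector_store_ids, a request-side tools entry looks like the minimal sketch below; the store id is illustrative and ranking_options remains optional.

# Hedged sketch of the renamed request field: file_search now takes a list of
# vector store ids (the id below is illustrative).
tools = [
    {
        "type": "file_search",
        "vector_store_ids": ["test_vector_store"],
        # "ranking_options": {...}  # still optional per the schema
    }
]
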
diff --git a/llama_stack/apis/agents/openai_responses.py b/llama_stack/apis/agents/openai_responses.py
index 35b3d5ace..bdd9c3e26 100644
--- a/llama_stack/apis/agents/openai_responses.py
+++ b/llama_stack/apis/agents/openai_responses.py
@@ -81,6 +81,15 @@ class OpenAIResponseOutputMessageWebSearchToolCall(BaseModel):
type: Literal["web_search_call"] = "web_search_call"
+@json_schema_type
+class OpenAIResponseOutputMessageFileSearchToolCall(BaseModel):
+ id: str
+ queries: list[str]
+ status: str
+ type: Literal["file_search_call"] = "file_search_call"
+ results: list[dict[str, Any]] | None = None
+
+
@json_schema_type
class OpenAIResponseOutputMessageFunctionToolCall(BaseModel):
call_id: str
@@ -119,6 +128,7 @@ class OpenAIResponseOutputMessageMCPListTools(BaseModel):
OpenAIResponseOutput = Annotated[
OpenAIResponseMessage
| OpenAIResponseOutputMessageWebSearchToolCall
+ | OpenAIResponseOutputMessageFileSearchToolCall
| OpenAIResponseOutputMessageFunctionToolCall
| OpenAIResponseOutputMessageMCPCall
| OpenAIResponseOutputMessageMCPListTools,
@@ -362,6 +372,7 @@ class OpenAIResponseInputFunctionToolCallOutput(BaseModel):
OpenAIResponseInput = Annotated[
# Responses API allows output messages to be passed in as input
OpenAIResponseOutputMessageWebSearchToolCall
+ | OpenAIResponseOutputMessageFileSearchToolCall
| OpenAIResponseOutputMessageFunctionToolCall
| OpenAIResponseInputFunctionToolCallOutput
|
@@ -397,9 +408,9 @@ class FileSearchRankingOptions(BaseModel):
@json_schema_type
class OpenAIResponseInputToolFileSearch(BaseModel):
type: Literal["file_search"] = "file_search"
- vector_store_id: list[str]
+ vector_store_ids: list[str]
ranking_options: FileSearchRankingOptions | None = None
- # TODO: add filters
+ # TODO: add filters, max_num_results
class ApprovalFilter(BaseModel):
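
Because the new model joins the discriminated OpenAIResponseOutput union, a "file_search_call" payload should now validate to it. A minimal sketch, assuming pydantic v2's TypeAdapter and an illustrative payload:

from pydantic import TypeAdapter

from llama_stack.apis.agents.openai_responses import (
    OpenAIResponseOutput,
    OpenAIResponseOutputMessageFileSearchToolCall,
)

# Validate an illustrative payload against the discriminated union; the
# "file_search_call" type should resolve to the new model.
item = TypeAdapter(OpenAIResponseOutput).validate_python(
    {
        "id": "fs_call_123",
        "queries": ["llama 4 maverick experts"],
        "status": "completed",
        "type": "file_search_call",
    }
)
assert isinstance(item, OpenAIResponseOutputMessageFileSearchToolCall)
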
diff --git a/llama_stack/providers/inline/agents/meta_reference/openai_responses.py b/llama_stack/providers/inline/agents/meta_reference/openai_responses.py
index 0ff6dc2c5..963dd1ddd 100644
--- a/llama_stack/providers/inline/agents/meta_reference/openai_responses.py
+++ b/llama_stack/providers/inline/agents/meta_reference/openai_responses.py
@@ -24,6 +24,7 @@ from llama_stack.apis.agents.openai_responses import (
OpenAIResponseInputMessageContentImage,
OpenAIResponseInputMessageContentText,
OpenAIResponseInputTool,
+ OpenAIResponseInputToolFileSearch,
OpenAIResponseInputToolMCP,
OpenAIResponseMessage,
OpenAIResponseObject,
@@ -34,6 +35,7 @@ from llama_stack.apis.agents.openai_responses import (
OpenAIResponseOutput,
OpenAIResponseOutputMessageContent,
OpenAIResponseOutputMessageContentOutputText,
+ OpenAIResponseOutputMessageFileSearchToolCall,
OpenAIResponseOutputMessageFunctionToolCall,
OpenAIResponseOutputMessageMCPListTools,
OpenAIResponseOutputMessageWebSearchToolCall,
@@ -198,7 +200,8 @@ class OpenAIResponsePreviousResponseWithInputItems(BaseModel):
class ChatCompletionContext(BaseModel):
model: str
messages: list[OpenAIMessageParam]
- tools: list[ChatCompletionToolParam] | None = None
+ response_tools: list[OpenAIResponseInputTool] | None = None
+ chat_tools: list[ChatCompletionToolParam] | None = None
mcp_tool_to_server: dict[str, OpenAIResponseInputToolMCP]
temperature: float | None
response_format: OpenAIResponseFormatParam
@@ -388,7 +391,8 @@ class OpenAIResponsesImpl:
ctx = ChatCompletionContext(
model=model,
messages=messages,
- tools=chat_tools,
+ response_tools=tools,
+ chat_tools=chat_tools,
mcp_tool_to_server=mcp_tool_to_server,
temperature=temperature,
response_format=response_format,
@@ -417,7 +421,7 @@ class OpenAIResponsesImpl:
completion_result = await self.inference_api.openai_chat_completion(
model=ctx.model,
messages=messages,
- tools=ctx.tools,
+ tools=ctx.chat_tools,
stream=True,
temperature=ctx.temperature,
response_format=ctx.response_format,
@@ -606,6 +610,12 @@ class OpenAIResponsesImpl:
if not tool:
raise ValueError(f"Tool {tool_name} not found")
chat_tools.append(make_openai_tool(tool_name, tool))
+ elif input_tool.type == "file_search":
+ tool_name = "knowledge_search"
+ tool = await self.tool_groups_api.get_tool(tool_name)
+ if not tool:
+ raise ValueError(f"Tool {tool_name} not found")
+ chat_tools.append(make_openai_tool(tool_name, tool))
elif input_tool.type == "mcp":
always_allowed = None
never_allowed = None
@@ -667,6 +677,7 @@ class OpenAIResponsesImpl:
tool_call_id = tool_call.id
function = tool_call.function
if not function or not tool_call_id or not function.name:
return None, None
+ tool_kwargs = json.loads(function.arguments) if function.arguments else {}
@@ -680,12 +691,18 @@ class OpenAIResponsesImpl:
endpoint=mcp_tool.server_url,
headers=mcp_tool.headers or {},
tool_name=function.name,
- kwargs=json.loads(function.arguments) if function.arguments else {},
+ kwargs=tool_kwargs,
)
else:
+ if function.name == "knowledge_search":
+ response_file_search_tool = next(
+ (t for t in ctx.response_tools or [] if isinstance(t, OpenAIResponseInputToolFileSearch)),
+ None,
+ )
+ if response_file_search_tool:
+ tool_kwargs["vector_db_ids"] = response_file_search_tool.vector_store_ids
result = await self.tool_runtime_api.invoke_tool(
tool_name=function.name,
- kwargs=json.loads(function.arguments) if function.arguments else {},
+ kwargs=tool_kwargs,
)
except Exception as e:
error_exc = e
@@ -713,6 +730,27 @@ class OpenAIResponsesImpl:
)
if error_exc or (result.error_code and result.error_code > 0) or result.error_message:
message.status = "failed"
+ elif function.name == "knowledge_search":
+ message = OpenAIResponseOutputMessageFileSearchToolCall(
+ id=tool_call_id,
+ queries=[tool_kwargs.get("query", "")],
+ status="completed",
+ )
+ if "document_ids" in result.metadata:
+ message.results = []
+ for i, doc_id in enumerate(result.metadata["document_ids"]):
+ text = result.metadata["chunks"][i] if "chunks" in result.metadata else None
+ score = result.metadata["scores"][i] if "scores" in result.metadata else None
+ message.results.append(
+ {
+ "file_id": doc_id,
+ "filename": doc_id,
+ "text": text,
+ "score": score,
+ }
+ )
+ if error_exc or (result.error_code and result.error_code > 0) or result.error_message:
+ message.status = "failed"
else:
raise ValueError(f"Unknown tool {function.name} called")
diff --git a/llama_stack/providers/inline/tool_runtime/rag/memory.py b/llama_stack/providers/inline/tool_runtime/rag/memory.py
index 4776d47d0..e15d067a7 100644
--- a/llama_stack/providers/inline/tool_runtime/rag/memory.py
+++ b/llama_stack/providers/inline/tool_runtime/rag/memory.py
@@ -170,6 +170,8 @@ class MemoryToolRuntimeImpl(ToolGroupsProtocolPrivate, ToolRuntime, RAGToolRunti
content=picked,
metadata={
"document_ids": [c.metadata["document_id"] for c in chunks[: len(picked)]],
+ "chunks": [c.content for c in chunks[: len(picked)]],
+ "scores": scores[: len(picked)],
},
)
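
The metadata returned by the RAG runtime is the contract the responses implementation turns into file_search results: the three lists are index-aligned, so entry i of each list describes the same retrieved chunk. An illustrative shape (values made up):

# Hedged sketch of ToolInvocationResult.metadata after this change; the three
# lists are index-aligned.
metadata = {
    "document_ids": ["doc1", "doc2"],
    "chunks": ["Llama 4 Maverick has 128 experts", "second retrieved chunk (illustrative)"],
    "scores": [0.98, 0.42],
}
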
diff --git a/tests/verifications/openai_api/fixtures/test_cases/responses.yaml b/tests/verifications/openai_api/fixtures/test_cases/responses.yaml
index 4d6c19b59..7115e4b50 100644
--- a/tests/verifications/openai_api/fixtures/test_cases/responses.yaml
+++ b/tests/verifications/openai_api/fixtures/test_cases/responses.yaml
@@ -31,6 +31,18 @@ test_response_web_search:
search_context_size: "low"
output: "128"
+test_response_file_search:
+ test_name: test_response_file_search
+ test_params:
+ case:
+ - case_id: "llama_experts"
+ input: "How many experts does the Llama 4 Maverick model have?"
+ tools:
+ - type: file_search
+ vector_store_ids:
+ - test_vector_store
+ output: "128"
+
test_response_mcp_tool:
test_name: test_response_mcp_tool
test_params:
diff --git a/tests/verifications/openai_api/test_responses.py b/tests/verifications/openai_api/test_responses.py
index 28020d3b1..86b267fac 100644
--- a/tests/verifications/openai_api/test_responses.py
+++ b/tests/verifications/openai_api/test_responses.py
@@ -9,6 +9,7 @@ import json
import httpx
import openai
import pytest
+from llama_stack_client import LlamaStackClient
from llama_stack import LlamaStackAsLibraryClient
from llama_stack.distribution.datatypes import AuthenticationRequiredError
@@ -258,6 +259,62 @@ def test_response_non_streaming_web_search(request, openai_client, model, provid
assert case["output"].lower() in response.output_text.lower().strip()
+@pytest.mark.parametrize(
+ "case",
+ responses_test_cases["test_response_file_search"]["test_params"]["case"],
+ ids=case_id_generator,
+)
+def test_response_non_streaming_file_search(
+ base_url, request, openai_client, model, provider, verification_config, case
+):
+ test_name_base = get_base_test_name(request)
+ if should_skip_test(verification_config, provider, model, test_name_base):
+ pytest.skip(f"Skipping {test_name_base} for model {model} on provider {provider} based on config.")
+
+ lls_client = LlamaStackClient(base_url=base_url.replace("/v1/openai/v1", ""))
+ vector_db_id = "test_vector_store"
+
+ # Ensure the test starts from a clean vector store
+ try:
+ lls_client.vector_dbs.unregister(vector_db_id=vector_db_id)
+ except Exception:
+ pass
+
+ lls_client.vector_dbs.register(
+ vector_db_id=vector_db_id,
+ embedding_model="all-MiniLM-L6-v2",
+ )
+ doc_content = "Llama 4 Maverick has 128 experts"
+ chunks = [
+ {
+ "content": doc_content,
+ "mime_type": "text/plain",
+ "metadata": {
+ "document_id": "doc1",
+ },
+ },
+ ]
+ lls_client.vector_io.insert(vector_db_id=vector_db_id, chunks=chunks)
+
+ response = openai_client.responses.create(
+ model=model,
+ input=case["input"],
+ tools=case["tools"],
+ stream=False,
+ )
+ assert len(response.output) > 1
+ assert response.output[0].type == "file_search_call"
+ assert response.output[0].status == "completed"
+ assert response.output[0].queries # ensure it's some non-empty list
+ assert response.output[0].results[0].text == doc_content
+ assert response.output[0].results[0].score > 0
+ assert response.output[1].type == "message"
+ assert response.output[1].status == "completed"
+ assert response.output[1].role == "assistant"
+ assert len(response.output[1].content) > 0
+ assert case["output"].lower() in response.output_text.lower().strip()
+
+
@pytest.mark.parametrize(
"case",
responses_test_cases["test_response_mcp_tool"]["test_params"]["case"],