Mirror of https://github.com/meta-llama/llama-stack.git, synced 2025-06-27 18:50:41 +00:00
# What does this PR do?

This is an initial working prototype of wiring up the `file_search` builtin tool for the Responses API to our existing RAG knowledge search tool. This is me seeing what I could pull together on top of the bits we already have merged. This may not be the ideal way to implement this, and things like how I shuffle the vector store ids from the original Responses API tool request to the actual tool execution feel a bit hacky (grep for `tool_kwargs["vector_db_ids"]` in `_execute_tool_call` to see what I mean).

## Test Plan

I stubbed in some new tests to exercise this using text and pdf documents.

Note that this is currently under tests/verifications only because it sometimes flakes with tool calling of the small Llama-3.2-3B model we run in CI (and that I use as an example below). We'd want to make the test a bit more robust in some way if we moved this over to tests/integration and ran it in CI.

### OpenAI SaaS (to verify test correctness)

```
pytest -sv tests/verifications/openai_api/test_responses.py \
  -k 'file_search' \
  --base-url=https://api.openai.com/v1 \
  --model=gpt-4o
```

### Fireworks with faiss vector store

```
llama stack run llama_stack/templates/fireworks/run.yaml

pytest -sv tests/verifications/openai_api/test_responses.py \
  -k 'file_search' \
  --base-url=http://localhost:8321/v1/openai/v1 \
  --model=meta-llama/Llama-3.3-70B-Instruct
```

### Ollama with faiss vector store

This sometimes flakes on Ollama because the quantized small model doesn't always choose to call the tool to answer the user's question. But it often works.

```
ollama run llama3.2:3b

INFERENCE_MODEL="meta-llama/Llama-3.2-3B-Instruct" \
  llama stack run ./llama_stack/templates/ollama/run.yaml \
  --image-type venv \
  --env OLLAMA_URL="http://0.0.0.0:11434"

pytest -sv tests/verifications/openai_api/test_responses.py \
  -k 'file_search' \
  --base-url=http://localhost:8321/v1/openai/v1 \
  --model=meta-llama/Llama-3.2-3B-Instruct
```

### OpenAI provider with sqlite-vec vector store

```
llama stack run ./llama_stack/templates/starter/run.yaml --image-type venv

pytest -sv tests/verifications/openai_api/test_responses.py \
  -k 'file_search' \
  --base-url=http://localhost:8321/v1/openai/v1 \
  --model=openai/gpt-4o-mini
```

### Ensure existing vector store integration tests still pass

```
ollama run llama3.2:3b

INFERENCE_MODEL="meta-llama/Llama-3.2-3B-Instruct" \
  llama stack run ./llama_stack/templates/ollama/run.yaml \
  --image-type venv \
  --env OLLAMA_URL="http://0.0.0.0:11434"

LLAMA_STACK_CONFIG=http://localhost:8321 \
  pytest -sv tests/integration/vector_io \
  --text-model "meta-llama/Llama-3.2-3B-Instruct" \
  --embedding-model=all-MiniLM-L6-v2
```

---------

Signed-off-by: Ben Browning <bbrownin@redhat.com>
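To make the wiring concrete, here is a minimal sketch (not part of the PR) of the kind of request these tests send, using the OpenAI Python client pointed at a local Llama Stack server. The vector store id and the question are hypothetical; the `file_search` tool shape (`type`, `vector_store_ids`) matches `OpenAIResponseInputToolFileSearch` in the schema file below.

```python
# Minimal sketch, assuming a running Llama Stack server and an existing,
# already-populated vector store (both hypothetical here).
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8321/v1/openai/v1", api_key="unused")

response = client.responses.create(
    model="meta-llama/Llama-3.2-3B-Instruct",
    input="What does the uploaded document say about llamas?",
    tools=[
        {
            "type": "file_search",
            "vector_store_ids": ["vs_example_123"],  # hypothetical vector store id
        }
    ],
)

# When the model chooses to call the tool, the output should contain a
# file_search_call item followed by the assistant's message.
for item in response.output:
    print(item.type)
```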
463 lines · 15 KiB · Python
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

from typing import Annotated, Any, Literal

from pydantic import BaseModel, Field
from typing_extensions import TypedDict

from llama_stack.schema_utils import json_schema_type, register_schema

# NOTE(ashwin): this file is literally a copy of the OpenAI responses API schema. We should probably
# take their YAML and generate this file automatically. Their YAML is available.


@json_schema_type
class OpenAIResponseError(BaseModel):
    code: str
    message: str


@json_schema_type
class OpenAIResponseInputMessageContentText(BaseModel):
    text: str
    type: Literal["input_text"] = "input_text"


@json_schema_type
class OpenAIResponseInputMessageContentImage(BaseModel):
    detail: Literal["low"] | Literal["high"] | Literal["auto"] = "auto"
    type: Literal["input_image"] = "input_image"
    # TODO: handle file_id
    image_url: str | None = None


# TODO: handle file content types
OpenAIResponseInputMessageContent = Annotated[
    OpenAIResponseInputMessageContentText | OpenAIResponseInputMessageContentImage,
    Field(discriminator="type"),
]
register_schema(OpenAIResponseInputMessageContent, name="OpenAIResponseInputMessageContent")


@json_schema_type
class OpenAIResponseOutputMessageContentOutputText(BaseModel):
    text: str
    type: Literal["output_text"] = "output_text"


OpenAIResponseOutputMessageContent = Annotated[
    OpenAIResponseOutputMessageContentOutputText,
    Field(discriminator="type"),
]
register_schema(OpenAIResponseOutputMessageContent, name="OpenAIResponseOutputMessageContent")


@json_schema_type
class OpenAIResponseMessage(BaseModel):
    """
    Corresponds to the various Message types in the Responses API.
    They are all under one type because the Responses API gives them all
    the same "type" value, and there is no way to tell them apart in certain
    scenarios.
    """

    content: str | list[OpenAIResponseInputMessageContent] | list[OpenAIResponseOutputMessageContent]
    role: Literal["system"] | Literal["developer"] | Literal["user"] | Literal["assistant"]
    type: Literal["message"] = "message"

    # The fields below are not used in all scenarios, but are required in others.
    id: str | None = None
    status: str | None = None
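
# Illustrative note (not in the upstream file): because input and output
# messages share type="message", both of these hypothetical payloads parse as
# OpenAIResponseMessage:
#
#   OpenAIResponseMessage.model_validate(
#       {"type": "message", "role": "user", "content": "hello"}
#   )
#   OpenAIResponseMessage.model_validate(
#       {"type": "message", "role": "assistant", "id": "msg_1", "status": "completed",
#        "content": [{"type": "output_text", "text": "hi there"}]}
#   )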


@json_schema_type
class OpenAIResponseOutputMessageWebSearchToolCall(BaseModel):
    id: str
    status: str
    type: Literal["web_search_call"] = "web_search_call"


@json_schema_type
class OpenAIResponseOutputMessageFileSearchToolCall(BaseModel):
    id: str
    queries: list[str]
    status: str
    type: Literal["file_search_call"] = "file_search_call"
    results: list[dict[str, Any]] | None = None


@json_schema_type
class OpenAIResponseOutputMessageFunctionToolCall(BaseModel):
    call_id: str
    name: str
    arguments: str
    type: Literal["function_call"] = "function_call"
    id: str | None = None
    status: str | None = None


@json_schema_type
class OpenAIResponseOutputMessageMCPCall(BaseModel):
    id: str
    type: Literal["mcp_call"] = "mcp_call"
    arguments: str
    name: str
    server_label: str
    error: str | None = None
    output: str | None = None


class MCPListToolsTool(BaseModel):
    input_schema: dict[str, Any]
    name: str
    description: str | None = None


@json_schema_type
class OpenAIResponseOutputMessageMCPListTools(BaseModel):
    id: str
    type: Literal["mcp_list_tools"] = "mcp_list_tools"
    server_label: str
    tools: list[MCPListToolsTool]


OpenAIResponseOutput = Annotated[
    OpenAIResponseMessage
    | OpenAIResponseOutputMessageWebSearchToolCall
    | OpenAIResponseOutputMessageFileSearchToolCall
    | OpenAIResponseOutputMessageFunctionToolCall
    | OpenAIResponseOutputMessageMCPCall
    | OpenAIResponseOutputMessageMCPListTools,
    Field(discriminator="type"),
]
register_schema(OpenAIResponseOutput, name="OpenAIResponseOutput")


# This has to be a TypedDict because we need a "schema" field and our strong
# typing code in the schema generator doesn't support Pydantic aliases. That also
# means we can't use a discriminator field here, because TypedDicts don't support
# default values which the strong typing code requires for discriminators.
class OpenAIResponseTextFormat(TypedDict, total=False):
    """Configuration for Responses API text format.

    :param type: Must be "text", "json_schema", or "json_object" to identify the format type
    :param name: The name of the response format. Only used for json_schema.
    :param schema: The JSON schema the response should conform to. In a Python SDK, this is often a `pydantic` model. Only used for json_schema.
    :param description: (Optional) A description of the response format. Only used for json_schema.
    :param strict: (Optional) Whether to strictly enforce the JSON schema. If true, the response must match the schema exactly. Only used for json_schema.
    """

    type: Literal["text"] | Literal["json_schema"] | Literal["json_object"]
    name: str | None
    schema: dict[str, Any] | None
    description: str | None
    strict: bool | None


@json_schema_type
class OpenAIResponseText(BaseModel):
    format: OpenAIResponseTextFormat | None = None
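
# Illustrative example (not in the upstream file): requesting structured JSON
# output via a json_schema text format, with a made-up schema:
#
#   OpenAIResponseText(
#       format=OpenAIResponseTextFormat(
#           type="json_schema",
#           name="capital",
#           schema={"type": "object", "properties": {"capital": {"type": "string"}}},
#           strict=True,
#       )
#   )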


@json_schema_type
class OpenAIResponseObject(BaseModel):
    created_at: int
    error: OpenAIResponseError | None = None
    id: str
    model: str
    object: Literal["response"] = "response"
    output: list[OpenAIResponseOutput]
    parallel_tool_calls: bool = False
    previous_response_id: str | None = None
    status: str
    temperature: float | None = None
    # Default to text format to avoid breaking the loading of old responses
    # before the field was added. New responses will have this set always.
    text: OpenAIResponseText = OpenAIResponseText(format=OpenAIResponseTextFormat(type="text"))
    top_p: float | None = None
    truncation: str | None = None
    user: str | None = None


@json_schema_type
class OpenAIResponseObjectStreamResponseCreated(BaseModel):
    response: OpenAIResponseObject
    type: Literal["response.created"] = "response.created"


@json_schema_type
class OpenAIResponseObjectStreamResponseCompleted(BaseModel):
    response: OpenAIResponseObject
    type: Literal["response.completed"] = "response.completed"


@json_schema_type
class OpenAIResponseObjectStreamResponseOutputItemAdded(BaseModel):
    response_id: str
    item: OpenAIResponseOutput
    output_index: int
    sequence_number: int
    type: Literal["response.output_item.added"] = "response.output_item.added"


@json_schema_type
class OpenAIResponseObjectStreamResponseOutputItemDone(BaseModel):
    response_id: str
    item: OpenAIResponseOutput
    output_index: int
    sequence_number: int
    type: Literal["response.output_item.done"] = "response.output_item.done"


@json_schema_type
class OpenAIResponseObjectStreamResponseOutputTextDelta(BaseModel):
    content_index: int
    delta: str
    item_id: str
    output_index: int
    sequence_number: int
    type: Literal["response.output_text.delta"] = "response.output_text.delta"


@json_schema_type
class OpenAIResponseObjectStreamResponseOutputTextDone(BaseModel):
    content_index: int
    text: str  # final text of the output item
    item_id: str
    output_index: int
    sequence_number: int
    type: Literal["response.output_text.done"] = "response.output_text.done"


@json_schema_type
class OpenAIResponseObjectStreamResponseFunctionCallArgumentsDelta(BaseModel):
    delta: str
    item_id: str
    output_index: int
    sequence_number: int
    type: Literal["response.function_call_arguments.delta"] = "response.function_call_arguments.delta"


@json_schema_type
class OpenAIResponseObjectStreamResponseFunctionCallArgumentsDone(BaseModel):
    arguments: str  # final arguments of the function call
    item_id: str
    output_index: int
    sequence_number: int
    type: Literal["response.function_call_arguments.done"] = "response.function_call_arguments.done"


@json_schema_type
class OpenAIResponseObjectStreamResponseWebSearchCallInProgress(BaseModel):
    item_id: str
    output_index: int
    sequence_number: int
    type: Literal["response.web_search_call.in_progress"] = "response.web_search_call.in_progress"


@json_schema_type
class OpenAIResponseObjectStreamResponseWebSearchCallSearching(BaseModel):
    item_id: str
    output_index: int
    sequence_number: int
    type: Literal["response.web_search_call.searching"] = "response.web_search_call.searching"


@json_schema_type
class OpenAIResponseObjectStreamResponseWebSearchCallCompleted(BaseModel):
    item_id: str
    output_index: int
    sequence_number: int
    type: Literal["response.web_search_call.completed"] = "response.web_search_call.completed"


@json_schema_type
class OpenAIResponseObjectStreamResponseMcpListToolsInProgress(BaseModel):
    sequence_number: int
    type: Literal["response.mcp_list_tools.in_progress"] = "response.mcp_list_tools.in_progress"


@json_schema_type
class OpenAIResponseObjectStreamResponseMcpListToolsFailed(BaseModel):
    sequence_number: int
    type: Literal["response.mcp_list_tools.failed"] = "response.mcp_list_tools.failed"


@json_schema_type
class OpenAIResponseObjectStreamResponseMcpListToolsCompleted(BaseModel):
    sequence_number: int
    type: Literal["response.mcp_list_tools.completed"] = "response.mcp_list_tools.completed"


@json_schema_type
class OpenAIResponseObjectStreamResponseMcpCallArgumentsDelta(BaseModel):
    delta: str
    item_id: str
    output_index: int
    sequence_number: int
    type: Literal["response.mcp_call.arguments.delta"] = "response.mcp_call.arguments.delta"


@json_schema_type
class OpenAIResponseObjectStreamResponseMcpCallArgumentsDone(BaseModel):
    arguments: str  # final arguments of the MCP call
    item_id: str
    output_index: int
    sequence_number: int
    type: Literal["response.mcp_call.arguments.done"] = "response.mcp_call.arguments.done"


@json_schema_type
class OpenAIResponseObjectStreamResponseMcpCallInProgress(BaseModel):
    item_id: str
    output_index: int
    sequence_number: int
    type: Literal["response.mcp_call.in_progress"] = "response.mcp_call.in_progress"


@json_schema_type
class OpenAIResponseObjectStreamResponseMcpCallFailed(BaseModel):
    sequence_number: int
    type: Literal["response.mcp_call.failed"] = "response.mcp_call.failed"


@json_schema_type
class OpenAIResponseObjectStreamResponseMcpCallCompleted(BaseModel):
    sequence_number: int
    type: Literal["response.mcp_call.completed"] = "response.mcp_call.completed"


OpenAIResponseObjectStream = Annotated[
    OpenAIResponseObjectStreamResponseCreated
    | OpenAIResponseObjectStreamResponseOutputItemAdded
    | OpenAIResponseObjectStreamResponseOutputItemDone
    | OpenAIResponseObjectStreamResponseOutputTextDelta
    | OpenAIResponseObjectStreamResponseOutputTextDone
    | OpenAIResponseObjectStreamResponseFunctionCallArgumentsDelta
    | OpenAIResponseObjectStreamResponseFunctionCallArgumentsDone
    | OpenAIResponseObjectStreamResponseWebSearchCallInProgress
    | OpenAIResponseObjectStreamResponseWebSearchCallSearching
    | OpenAIResponseObjectStreamResponseWebSearchCallCompleted
    | OpenAIResponseObjectStreamResponseMcpListToolsInProgress
    | OpenAIResponseObjectStreamResponseMcpListToolsFailed
    | OpenAIResponseObjectStreamResponseMcpListToolsCompleted
    | OpenAIResponseObjectStreamResponseMcpCallArgumentsDelta
    | OpenAIResponseObjectStreamResponseMcpCallArgumentsDone
    | OpenAIResponseObjectStreamResponseMcpCallInProgress
    | OpenAIResponseObjectStreamResponseMcpCallFailed
    | OpenAIResponseObjectStreamResponseMcpCallCompleted
    | OpenAIResponseObjectStreamResponseCompleted,
    Field(discriminator="type"),
]
register_schema(OpenAIResponseObjectStream, name="OpenAIResponseObjectStream")
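
# Illustrative example (not in the upstream file): consumers can dispatch on
# the discriminated event types, e.g. to accumulate streamed text from some
# hypothetical iterator of OpenAIResponseObjectStream events:
#
#   text = ""
#   for event in events:
#       if isinstance(event, OpenAIResponseObjectStreamResponseOutputTextDelta):
#           text += event.delta
#       elif isinstance(event, OpenAIResponseObjectStreamResponseCompleted):
#           break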


@json_schema_type
class OpenAIResponseInputFunctionToolCallOutput(BaseModel):
    """
    This represents the output of a function call that gets passed back to the model.
    """

    call_id: str
    output: str
    type: Literal["function_call_output"] = "function_call_output"
    id: str | None = None
    status: str | None = None


OpenAIResponseInput = Annotated[
    # Responses API allows output messages to be passed in as input
    OpenAIResponseOutputMessageWebSearchToolCall
    | OpenAIResponseOutputMessageFileSearchToolCall
    | OpenAIResponseOutputMessageFunctionToolCall
    | OpenAIResponseInputFunctionToolCallOutput
    |
    # Fallback to the generic message type as a last resort
    OpenAIResponseMessage,
    Field(union_mode="left_to_right"),
]
register_schema(OpenAIResponseInput, name="OpenAIResponseInput")


@json_schema_type
class OpenAIResponseInputToolWebSearch(BaseModel):
    type: Literal["web_search"] | Literal["web_search_preview_2025_03_11"] = "web_search"
    # TODO: actually use search_context_size somewhere...
    search_context_size: str | None = Field(default="medium", pattern="^(low|medium|high)$")
    # TODO: add user_location


@json_schema_type
class OpenAIResponseInputToolFunction(BaseModel):
    type: Literal["function"] = "function"
    name: str
    description: str | None = None
    parameters: dict[str, Any] | None
    strict: bool | None = None


class FileSearchRankingOptions(BaseModel):
    ranker: str | None = None
    score_threshold: float | None = Field(default=0.0, ge=0.0, le=1.0)


@json_schema_type
class OpenAIResponseInputToolFileSearch(BaseModel):
    type: Literal["file_search"] = "file_search"
    vector_store_ids: list[str]
    filters: dict[str, Any] | None = None
    max_num_results: int | None = Field(default=10, ge=1, le=50)
    ranking_options: FileSearchRankingOptions | None = None
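
# Illustrative example (not in the upstream file): the file_search tool spec a
# client sends in a Responses request, with a hypothetical vector store id:
#
#   OpenAIResponseInputToolFileSearch(
#       vector_store_ids=["vs_example_123"],
#       max_num_results=5,
#       ranking_options=FileSearchRankingOptions(score_threshold=0.2),
#   )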


class ApprovalFilter(BaseModel):
    always: list[str] | None = None
    never: list[str] | None = None


class AllowedToolsFilter(BaseModel):
    tool_names: list[str] | None = None


@json_schema_type
class OpenAIResponseInputToolMCP(BaseModel):
    type: Literal["mcp"] = "mcp"
    server_label: str
    server_url: str
    headers: dict[str, Any] | None = None

    require_approval: Literal["always"] | Literal["never"] | ApprovalFilter = "never"
    allowed_tools: list[str] | AllowedToolsFilter | None = None


OpenAIResponseInputTool = Annotated[
    OpenAIResponseInputToolWebSearch
    | OpenAIResponseInputToolFileSearch
    | OpenAIResponseInputToolFunction
    | OpenAIResponseInputToolMCP,
    Field(discriminator="type"),
]
register_schema(OpenAIResponseInputTool, name="OpenAIResponseInputTool")


class ListOpenAIResponseInputItem(BaseModel):
    data: list[OpenAIResponseInput]
    object: Literal["list"] = "list"


@json_schema_type
class OpenAIResponseObjectWithInput(OpenAIResponseObject):
    input: list[OpenAIResponseInput]


@json_schema_type
class ListOpenAIResponseObject(BaseModel):
    data: list[OpenAIResponseObjectWithInput]
    has_more: bool
    first_id: str
    last_id: str
    object: Literal["list"] = "list"
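
# Illustrative example (not in the upstream file): round-tripping a minimal
# response payload (made-up ids and values) through the schema:
#
#   payload = {
#       "id": "resp_abc123",
#       "created_at": 1718000000,
#       "model": "meta-llama/Llama-3.2-3B-Instruct",
#       "object": "response",
#       "status": "completed",
#       "output": [
#           {
#               "type": "message",
#               "role": "assistant",
#               "content": [{"type": "output_text", "text": "Hello!"}],
#           }
#       ],
#   }
#   response = OpenAIResponseObject.model_validate(payload)
#   assert response.output[0].content[0].text == "Hello!"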