Merge branch 'main' into fix-tool-call-args

This commit is contained in:
Ashwin Bharambe 2025-09-30 14:59:22 -07:00 committed by GitHub
commit cbc1b6889e
89 changed files with 14920 additions and 2301 deletions

View file

@ -237,6 +237,7 @@ class OpenAIResponsesImpl:
response_tools=tools,
temperature=temperature,
response_format=response_format,
inputs=input,
)
# Create orchestrator and delegate streaming logic

View file

@ -10,10 +10,12 @@ from typing import Any
from llama_stack.apis.agents.openai_responses import (
AllowedToolsFilter,
ApprovalFilter,
MCPListToolsTool,
OpenAIResponseContentPartOutputText,
OpenAIResponseInputTool,
OpenAIResponseInputToolMCP,
OpenAIResponseMCPApprovalRequest,
OpenAIResponseObject,
OpenAIResponseObjectStream,
OpenAIResponseObjectStreamResponseCompleted,
@ -127,13 +129,16 @@ class StreamingResponseOrchestrator:
messages = self.ctx.messages.copy()
while True:
# Text is the default response format for chat completion, so we don't need to pass it
# (some providers don't support non-empty response_format when tools are present)
response_format = None if self.ctx.response_format.type == "text" else self.ctx.response_format
completion_result = await self.inference_api.openai_chat_completion(
model=self.ctx.model,
messages=messages,
tools=self.ctx.chat_tools,
stream=True,
temperature=self.ctx.temperature,
response_format=self.ctx.response_format,
response_format=response_format,
)
# Process streaming chunks and build complete response
@ -147,10 +152,17 @@ class StreamingResponseOrchestrator:
raise ValueError("Streaming chunk processor failed to return completion data")
current_response = self._build_chat_completion(completion_result_data)
function_tool_calls, non_function_tool_calls, next_turn_messages = self._separate_tool_calls(
function_tool_calls, non_function_tool_calls, approvals, next_turn_messages = self._separate_tool_calls(
current_response, messages
)
# Emit any required approval requests
for tool_call in approvals:
async for evt in self._add_mcp_approval_request(
tool_call.function.name, tool_call.function.arguments, output_messages
):
yield evt
# Handle choices with no tool calls
for choice in current_response.choices:
if not (choice.message.tool_calls and self.ctx.response_tools):
@ -194,10 +206,11 @@ class StreamingResponseOrchestrator:
# Emit response.completed
yield OpenAIResponseObjectStreamResponseCompleted(response=final_response)
def _separate_tool_calls(self, current_response, messages) -> tuple[list, list, list]:
def _separate_tool_calls(self, current_response, messages) -> tuple[list, list, list, list]:
"""Separate tool calls into function and non-function categories."""
function_tool_calls = []
non_function_tool_calls = []
approvals = []
next_turn_messages = messages.copy()
for choice in current_response.choices:
@ -208,9 +221,23 @@ class StreamingResponseOrchestrator:
if is_function_tool_call(tool_call, self.ctx.response_tools):
function_tool_calls.append(tool_call)
else:
non_function_tool_calls.append(tool_call)
if self._approval_required(tool_call.function.name):
approval_response = self.ctx.approval_response(
tool_call.function.name, tool_call.function.arguments
)
if approval_response:
if approval_response.approve:
logger.info(f"Approval granted for {tool_call.id} on {tool_call.function.name}")
non_function_tool_calls.append(tool_call)
else:
logger.info(f"Approval denied for {tool_call.id} on {tool_call.function.name}")
else:
logger.info(f"Requesting approval for {tool_call.id} on {tool_call.function.name}")
approvals.append(tool_call)
else:
non_function_tool_calls.append(tool_call)
return function_tool_calls, non_function_tool_calls, next_turn_messages
return function_tool_calls, non_function_tool_calls, approvals, next_turn_messages
async def _process_streaming_chunks(
self, completion_result, output_messages: list[OpenAIResponseOutput]
@ -649,3 +676,46 @@ class StreamingResponseOrchestrator:
# TODO: Emit mcp_list_tools.failed event if needed
logger.exception(f"Failed to list MCP tools from {mcp_tool.server_url}: {e}")
raise
def _approval_required(self, tool_name: str) -> bool:
if tool_name not in self.mcp_tool_to_server:
return False
mcp_server = self.mcp_tool_to_server[tool_name]
if mcp_server.require_approval == "always":
return True
if mcp_server.require_approval == "never":
return False
if isinstance(mcp_server.require_approval, ApprovalFilter):
if tool_name in mcp_server.require_approval.always:
return True
if tool_name in mcp_server.require_approval.never:
return False
return True
async def _add_mcp_approval_request(
self, tool_name: str, arguments: str, output_messages: list[OpenAIResponseOutput]
) -> AsyncIterator[OpenAIResponseObjectStream]:
mcp_server = self.mcp_tool_to_server[tool_name]
mcp_approval_request = OpenAIResponseMCPApprovalRequest(
arguments=arguments,
id=f"approval_{uuid.uuid4()}",
name=tool_name,
server_label=mcp_server.server_label,
)
output_messages.append(mcp_approval_request)
self.sequence_number += 1
yield OpenAIResponseObjectStreamResponseOutputItemAdded(
response_id=self.response_id,
item=mcp_approval_request,
output_index=len(output_messages) - 1,
sequence_number=self.sequence_number,
)
self.sequence_number += 1
yield OpenAIResponseObjectStreamResponseOutputItemDone(
response_id=self.response_id,
item=mcp_approval_request,
output_index=len(output_messages) - 1,
sequence_number=self.sequence_number,
)
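
The hunks above only show the server side of the approval flow. For orientation, a client-side round trip enabled by this change might look like the sketch below. This is a minimal sketch, not documented usage: the base URL, model name, and MCP server URL are placeholder assumptions for a locally running Llama Stack server behind an OpenAI-compatible Responses client, and the item shapes mirror the `OpenAIResponseMCPApprovalRequest` / `OpenAIResponseMCPApprovalResponse` types used in the diff.

```python
# Minimal sketch of the approval round trip (assumed endpoint, model, and MCP
# server URL; item shapes mirror OpenAIResponseMCPApprovalRequest/Response).
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8321/v1/openai/v1", api_key="none")

mcp_tool = {
    "type": "mcp",
    "server_label": "demo",
    "server_url": "http://localhost:3000/sse",  # hypothetical MCP server
    "require_approval": "always",
}

# Turn 1: because require_approval == "always", the orchestrator emits an
# mcp_approval_request output item instead of invoking the tool.
resp = client.responses.create(
    model="meta-llama/Llama-3.1-8B-Instruct",
    input="What's the weather like in San Francisco?",
    tools=[mcp_tool],
)
approval = next(item for item in resp.output if item.type == "mcp_approval_request")

# Turn 2: the approval response is matched to the earlier request by its id;
# if approved, the MCP tool call is executed on this turn.
followup = client.responses.create(
    model="meta-llama/Llama-3.1-8B-Instruct",
    previous_response_id=resp.id,
    input=[
        {
            "type": "mcp_approval_response",
            "approval_request_id": approval.id,
            "approve": True,
        }
    ],
    tools=[mcp_tool],
)
print(followup.output_text)
```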

View file

@ -10,7 +10,10 @@ from openai.types.chat import ChatCompletionToolParam
from pydantic import BaseModel
from llama_stack.apis.agents.openai_responses import (
OpenAIResponseInput,
OpenAIResponseInputTool,
OpenAIResponseMCPApprovalRequest,
OpenAIResponseMCPApprovalResponse,
OpenAIResponseObjectStream,
OpenAIResponseOutput,
)
@ -58,3 +61,37 @@ class ChatCompletionContext(BaseModel):
chat_tools: list[ChatCompletionToolParam] | None = None
temperature: float | None
response_format: OpenAIResponseFormatParam
approval_requests: list[OpenAIResponseMCPApprovalRequest] = []
approval_responses: dict[str, OpenAIResponseMCPApprovalResponse] = {}
def __init__(
self,
model: str,
messages: list[OpenAIMessageParam],
response_tools: list[OpenAIResponseInputTool] | None,
temperature: float | None,
response_format: OpenAIResponseFormatParam,
inputs: list[OpenAIResponseInput] | str,
):
super().__init__(
model=model,
messages=messages,
response_tools=response_tools,
temperature=temperature,
response_format=response_format,
)
if not isinstance(inputs, str):
self.approval_requests = [input for input in inputs if input.type == "mcp_approval_request"]
self.approval_responses = {
input.approval_request_id: input for input in inputs if input.type == "mcp_approval_response"
}
def approval_response(self, tool_name: str, arguments: str) -> OpenAIResponseMCPApprovalResponse | None:
request = self._approval_request(tool_name, arguments)
return self.approval_responses.get(request.id, None) if request else None
def _approval_request(self, tool_name: str, arguments: str) -> OpenAIResponseMCPApprovalRequest | None:
for request in self.approval_requests:
if request.name == tool_name and request.arguments == arguments:
return request
return None
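
To make the matching rule concrete: a stored approval response is only returned when a prior request with the same tool name and an identical arguments string exists, and the lookup goes through that request's `id`. The sketch below illustrates this with the field names used in the diff; the import paths and the `OpenAIResponseFormatText` class used as the `type == "text"` response format are assumptions.

```python
# Sketch of the (name, arguments) matching rule; import paths are assumptions.
from llama_stack.apis.agents.openai_responses import (
    OpenAIResponseMCPApprovalRequest,
    OpenAIResponseMCPApprovalResponse,
)
from llama_stack.apis.inference import OpenAIResponseFormatText
from llama_stack.providers.inline.agents.meta_reference.responses.types import (
    ChatCompletionContext,
)

request = OpenAIResponseMCPApprovalRequest(
    id="approval_abc",
    name="get_weather",
    arguments='{"city": "San Francisco"}',
    server_label="demo",
)
response = OpenAIResponseMCPApprovalResponse(
    type="mcp_approval_response",
    approval_request_id="approval_abc",
    approve=True,
)

ctx = ChatCompletionContext(
    model="meta-llama/Llama-3.1-8B-Instruct",
    messages=[],
    response_tools=None,
    temperature=None,
    response_format=OpenAIResponseFormatText(),
    inputs=[request, response],
)

# Exact tool name + identical arguments string -> the stored response is found.
assert ctx.approval_response("get_weather", '{"city": "San Francisco"}').approve
# Any difference in the arguments string -> no matching request, so None.
assert ctx.approval_response("get_weather", '{"city": "NYC"}') is None
```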

View file

@ -13,6 +13,8 @@ from llama_stack.apis.agents.openai_responses import (
OpenAIResponseInputMessageContentImage,
OpenAIResponseInputMessageContentText,
OpenAIResponseInputTool,
OpenAIResponseMCPApprovalRequest,
OpenAIResponseMCPApprovalResponse,
OpenAIResponseMessage,
OpenAIResponseOutputMessageContent,
OpenAIResponseOutputMessageContentOutputText,
@ -149,6 +151,11 @@ async def convert_response_input_to_chat_messages(
elif isinstance(input_item, OpenAIResponseOutputMessageMCPListTools):
# the tool list will be handled separately
pass
elif isinstance(input_item, OpenAIResponseMCPApprovalRequest) or isinstance(
input_item, OpenAIResponseMCPApprovalResponse
):
# these are handled by the responses impl itself and not passed through to chat completions
pass
else:
content = await convert_response_content_to_chat_content(input_item.content)
message_type = await get_message_type_by_role(input_item.role)

View file

@ -9,7 +9,7 @@ import uuid
from pathlib import Path
from typing import Annotated
from fastapi import File, Form, Response, UploadFile
from fastapi import Depends, File, Form, Response, UploadFile
from llama_stack.apis.common.errors import ResourceNotFoundError
from llama_stack.apis.common.responses import Order
@ -23,6 +23,7 @@ from llama_stack.apis.files import (
)
from llama_stack.core.datatypes import AccessRule
from llama_stack.log import get_logger
from llama_stack.providers.utils.files.form_data import parse_expires_after
from llama_stack.providers.utils.sqlstore.api import ColumnDefinition, ColumnType
from llama_stack.providers.utils.sqlstore.authorized_sqlstore import AuthorizedSqlStore
from llama_stack.providers.utils.sqlstore.sqlstore import sqlstore_impl
@ -87,7 +88,7 @@ class LocalfsFilesImpl(Files):
self,
file: Annotated[UploadFile, File()],
purpose: Annotated[OpenAIFilePurpose, Form()],
expires_after: Annotated[ExpiresAfter | None, Form()] = None,
expires_after: Annotated[ExpiresAfter | None, Depends(parse_expires_after)] = None,
) -> OpenAIFileObject:
"""Upload a file that can be used across various endpoints."""
if not self.sql_store:

View file

@ -290,13 +290,13 @@ class LlamaGuardShield:
else:
shield_input_message = self.build_text_shield_input(messages)
# TODO: llama-stack inference protocol has issues with non-streaming inference code
response = await self.inference_api.chat_completion(
model_id=self.model,
response = await self.inference_api.openai_chat_completion(
model=self.model,
messages=[shield_input_message],
stream=False,
temperature=0.0, # default is 1, which is too high for safety
)
content = response.completion_message.content
content = response.choices[0].message.content
content = content.strip()
return self.get_shield_response(content)

View file

@ -30,7 +30,7 @@ class TelemetryConfig(BaseModel):
description="The service name to use for telemetry",
)
sinks: list[TelemetrySink] = Field(
default=[TelemetrySink.CONSOLE, TelemetrySink.SQLITE],
default=[TelemetrySink.SQLITE],
description="List of telemetry sinks to enable (possible values: otel_trace, otel_metric, sqlite, console)",
)
sqlite_db_path: str = Field(
@ -49,7 +49,7 @@ class TelemetryConfig(BaseModel):
def sample_run_config(cls, __distro_dir__: str, db_name: str = "trace_store.db") -> dict[str, Any]:
return {
"service_name": "${env.OTEL_SERVICE_NAME:=\u200b}",
"sinks": "${env.TELEMETRY_SINKS:=console,sqlite}",
"sinks": "${env.TELEMETRY_SINKS:=sqlite}",
"sqlite_db_path": "${env.SQLITE_STORE_DIR:=" + __distro_dir__ + "}/" + db_name,
"otel_exporter_otlp_endpoint": "${env.OTEL_EXPORTER_OTLP_ENDPOINT:=}",
}

View file

@ -10,7 +10,7 @@ from typing import Annotated, Any
import boto3
from botocore.exceptions import BotoCoreError, ClientError, NoCredentialsError
from fastapi import File, Form, Response, UploadFile
from fastapi import Depends, File, Form, Response, UploadFile
from llama_stack.apis.common.errors import ResourceNotFoundError
from llama_stack.apis.common.responses import Order
@ -23,6 +23,7 @@ from llama_stack.apis.files import (
OpenAIFilePurpose,
)
from llama_stack.core.datatypes import AccessRule
from llama_stack.providers.utils.files.form_data import parse_expires_after
from llama_stack.providers.utils.sqlstore.api import ColumnDefinition, ColumnType
from llama_stack.providers.utils.sqlstore.authorized_sqlstore import AuthorizedSqlStore
from llama_stack.providers.utils.sqlstore.sqlstore import sqlstore_impl
@ -195,7 +196,7 @@ class S3FilesImpl(Files):
self,
file: Annotated[UploadFile, File()],
purpose: Annotated[OpenAIFilePurpose, Form()],
expires_after: Annotated[ExpiresAfter | None, Form()] = None,
expires_after: Annotated[ExpiresAfter | None, Depends(parse_expires_after)] = None,
) -> OpenAIFileObject:
file_id = f"file-{uuid.uuid4().hex}"

View file

@ -44,8 +44,8 @@ client.initialize()
The following example shows how to create a chat completion for an NVIDIA NIM.
```python
response = client.inference.chat_completion(
model_id="meta-llama/Llama-3.1-8B-Instruct",
response = client.chat.completions.create(
model="meta-llama/Llama-3.1-8B-Instruct",
messages=[
{
"role": "system",
@ -57,11 +57,9 @@ response = client.inference.chat_completion(
},
],
stream=False,
sampling_params={
"max_tokens": 50,
},
max_tokens=50,
)
print(f"Response: {response.completion_message.content}")
print(f"Response: {response.choices[0].message.content}")
```
### Tool Calling Example ###
@ -89,15 +87,15 @@ tool_definition = ToolDefinition(
},
)
tool_response = client.inference.chat_completion(
model_id="meta-llama/Llama-3.1-8B-Instruct",
tool_response = client.chat.completions.create(
model="meta-llama/Llama-3.1-8B-Instruct",
messages=[{"role": "user", "content": "What's the weather like in San Francisco?"}],
tools=[tool_definition],
)
print(f"Tool Response: {tool_response.completion_message.content}")
if tool_response.completion_message.tool_calls:
for tool_call in tool_response.completion_message.tool_calls:
print(f"Tool Response: {tool_response.choices[0].message.content}")
if tool_response.choices[0].message.tool_calls:
for tool_call in tool_response.choices[0].message.tool_calls:
print(f"Tool Called: {tool_call.tool_name}")
print(f"Arguments: {tool_call.arguments}")
```
@ -123,8 +121,8 @@ response_format = JsonSchemaResponseFormat(
type=ResponseFormatType.json_schema, json_schema=person_schema
)
structured_response = client.inference.chat_completion(
model_id="meta-llama/Llama-3.1-8B-Instruct",
structured_response = client.chat.completions.create(
model="meta-llama/Llama-3.1-8B-Instruct",
messages=[
{
"role": "user",
@ -134,7 +132,7 @@ structured_response = client.inference.chat_completion(
response_format=response_format,
)
print(f"Structured Response: {structured_response.completion_message.content}")
print(f"Structured Response: {structured_response.choices[0].message.content}")
```
### Create Embeddings
@ -167,8 +165,8 @@ def load_image_as_base64(image_path):
image_path = {path_to_the_image}
demo_image_b64 = load_image_as_base64(image_path)
vlm_response = client.inference.chat_completion(
model_id="nvidia/vila",
vlm_response = client.chat.completions.create(
model="nvidia/vila",
messages=[
{
"role": "user",
@ -188,5 +186,5 @@ vlm_response = client.inference.chat_completion(
],
)
print(f"VLM Response: {vlm_response.completion_message.content}")
print(f"VLM Response: {vlm_response.choices[0].message.content}")
```

View file

@ -0,0 +1,5 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

View file

@ -0,0 +1,69 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import json
from fastapi import Request
from pydantic import BaseModel, ValidationError
from llama_stack.apis.files import ExpiresAfter
async def parse_pydantic_from_form[T: BaseModel](request: Request, field_name: str, model_class: type[T]) -> T | None:
"""
Generic parser to extract a Pydantic model from multipart form data.
Handles both bracket notation (field[attr1], field[attr2]) and JSON string format.
Args:
request: The FastAPI request object
field_name: The name of the field in the form data (e.g., "expires_after")
model_class: The Pydantic model class to parse into
Returns:
An instance of model_class if parsing succeeds, None otherwise
Example:
expires_after = await parse_pydantic_from_form(
request, "expires_after", ExpiresAfter
)
"""
form = await request.form()
# Check for bracket notation first (e.g., expires_after[anchor], expires_after[seconds])
bracket_data = {}
prefix = f"{field_name}["
for key in form.keys():
if key.startswith(prefix) and key.endswith("]"):
# Extract the attribute name from field_name[attr]
attr = key[len(prefix) : -1]
bracket_data[attr] = form[key]
if bracket_data:
try:
return model_class(**bracket_data)
except (ValidationError, TypeError):
pass
# Check for JSON string format
if field_name in form:
value = form[field_name]
if isinstance(value, str):
try:
data = json.loads(value)
return model_class(**data)
except (json.JSONDecodeError, TypeError, ValidationError):
pass
return None
async def parse_expires_after(request: Request) -> ExpiresAfter | None:
"""
Dependency to parse expires_after from multipart form data.
Handles both bracket notation (expires_after[anchor], expires_after[seconds])
and JSON string format.
"""
return await parse_pydantic_from_form(request, "expires_after", ExpiresAfter)
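
As a quick illustration of what this dependency accepts on the wire, here is a minimal client-side sketch covering the two encodings described in the docstrings above. The endpoint URL, purpose value, and file name are placeholder assumptions for a locally running server.

```python
# Sketch: both encodings that parse_expires_after accepts; endpoint URL and
# file are placeholders for a locally running Llama Stack server.
import json

import requests

url = "http://localhost:8321/v1/openai/v1/files"  # assumed endpoint

# 1. OpenAI-style bracket notation: expires_after[anchor] / expires_after[seconds]
with open("report.pdf", "rb") as f:
    requests.post(
        url,
        files={"file": f},
        data={
            "purpose": "assistants",
            "expires_after[anchor]": "created_at",
            "expires_after[seconds]": "86400",
        },
    )

# 2. A single JSON-encoded form field, which the parser also accepts
with open("report.pdf", "rb") as f:
    requests.post(
        url,
        files={"file": f},
        data={
            "purpose": "assistants",
            "expires_after": json.dumps({"anchor": "created_at", "seconds": 86400}),
        },
    )
```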

View file

@ -24,11 +24,13 @@ from llama_stack.apis.vector_io import (
VectorStoreChunkingStrategyStatic,
VectorStoreContent,
VectorStoreDeleteResponse,
VectorStoreFileBatchObject,
VectorStoreFileContentsResponse,
VectorStoreFileCounts,
VectorStoreFileDeleteResponse,
VectorStoreFileLastError,
VectorStoreFileObject,
VectorStoreFilesListInBatchResponse,
VectorStoreFileStatus,
VectorStoreListFilesResponse,
VectorStoreListResponse,
@ -107,7 +109,11 @@ class OpenAIVectorStoreMixin(ABC):
self.openai_vector_stores.pop(store_id, None)
async def _save_openai_vector_store_file(
self, store_id: str, file_id: str, file_info: dict[str, Any], file_contents: list[dict[str, Any]]
self,
store_id: str,
file_id: str,
file_info: dict[str, Any],
file_contents: list[dict[str, Any]],
) -> None:
"""Save vector store file metadata to persistent storage."""
assert self.kvstore
@ -301,7 +307,10 @@ class OpenAIVectorStoreMixin(ABC):
all_stores = all_stores[after_index + 1 :]
if before:
before_index = next((i for i, store in enumerate(all_stores) if store["id"] == before), len(all_stores))
before_index = next(
(i for i, store in enumerate(all_stores) if store["id"] == before),
len(all_stores),
)
all_stores = all_stores[:before_index]
# Apply limit
@ -397,7 +406,9 @@ class OpenAIVectorStoreMixin(ABC):
max_num_results: int | None = 10,
ranking_options: SearchRankingOptions | None = None,
rewrite_query: bool | None = False,
search_mode: str | None = "vector", # Using str instead of Literal due to OpenAPI schema generator limitations
search_mode: (
str | None
) = "vector", # Using str instead of Literal due to OpenAPI schema generator limitations
) -> VectorStoreSearchResponsePage:
"""Search for chunks in a vector store."""
max_num_results = max_num_results or 10
@ -685,7 +696,10 @@ class OpenAIVectorStoreMixin(ABC):
file_objects = file_objects[after_index + 1 :]
if before:
before_index = next((i for i, file in enumerate(file_objects) if file.id == before), len(file_objects))
before_index = next(
(i for i, file in enumerate(file_objects) if file.id == before),
len(file_objects),
)
file_objects = file_objects[:before_index]
# Apply limit
@ -805,3 +819,42 @@ class OpenAIVectorStoreMixin(ABC):
id=file_id,
deleted=True,
)
async def openai_create_vector_store_file_batch(
self,
vector_store_id: str,
file_ids: list[str],
attributes: dict[str, Any] | None = None,
chunking_strategy: VectorStoreChunkingStrategy | None = None,
) -> VectorStoreFileBatchObject:
"""Create a vector store file batch."""
raise NotImplementedError("openai_create_vector_store_file_batch is not implemented yet")
async def openai_list_files_in_vector_store_file_batch(
self,
batch_id: str,
vector_store_id: str,
after: str | None = None,
before: str | None = None,
filter: str | None = None,
limit: int | None = 20,
order: str | None = "desc",
) -> VectorStoreFilesListInBatchResponse:
"""Returns a list of vector store files in a batch."""
raise NotImplementedError("openai_list_files_in_vector_store_file_batch is not implemented yet")
async def openai_retrieve_vector_store_file_batch(
self,
batch_id: str,
vector_store_id: str,
) -> VectorStoreFileBatchObject:
"""Retrieve a vector store file batch."""
raise NotImplementedError("openai_retrieve_vector_store_file_batch is not implemented yet")
async def openai_cancel_vector_store_file_batch(
self,
batch_id: str,
vector_store_id: str,
) -> VectorStoreFileBatchObject:
"""Cancel a vector store file batch."""
raise NotImplementedError("openai_cancel_vector_store_file_batch is not implemented yet")