Merge-related changes.

2025-12-31 07:39:59 +00:00 · 2025-04-02 19:56:44 +02:00 · 2025-04-02 19:56:44 +02:00 · 60e9f46856
commit 60e9f46856
parent d38aea33c1 66d6c2580e
456 changed files with 38636 additions and 10892 deletions
--- a/llama_stack/providers/inline/agents/meta_reference/agent_instance.py
+++ b/llama_stack/providers/inline/agents/meta_reference/agent_instance.py
@ -6,14 +6,12 @@

 import copy
 import json
-import os
 import re
 import secrets
 import string
 import uuid
-from datetime import datetime
-from typing import Any, AsyncGenerator, Dict, List, Optional, Tuple, Union
-from urllib.parse import urlparse
+from datetime import datetime, timezone
+from typing import AsyncGenerator, List, Optional, Union

 import httpx

@ -59,12 +57,7 @@ from llama_stack.apis.inference import (
    UserMessage,
 )
 from llama_stack.apis.safety import Safety
-from llama_stack.apis.tools import (
-    RAGDocument,
-    ToolGroups,
-    ToolInvocationResult,
-    ToolRuntime,
-)
+from llama_stack.apis.tools import ToolGroups, ToolInvocationResult, ToolRuntime
 from llama_stack.apis.vector_io import VectorIO
 from llama_stack.log import get_logger
 from llama_stack.models.llama.datatypes import (
@ -153,7 +146,6 @@ class ChatAgent(ShieldRunnerMixin):
                    messages.append(
                        ToolResponseMessage(
                            call_id=response.call_id,
-                            tool_name=response.tool_name,
                            content=response.content,
                        )
                    )
@ -181,23 +173,29 @@ class ChatAgent(ShieldRunnerMixin):
        return messages

    async def create_and_execute_turn(self, request: AgentTurnCreateRequest) -> AsyncGenerator:
-        async with tracing.span("create_and_execute_turn") as span:
+        turn_id = str(uuid.uuid4())  # bound before the span check: _run_turn needs it even when no span is active
+        span = tracing.get_current_span()
+        if span:
            span.set_attribute("session_id", request.session_id)
            span.set_attribute("agent_id", self.agent_id)
            span.set_attribute("request", request.model_dump_json())
-            turn_id = str(uuid.uuid4())
            span.set_attribute("turn_id", turn_id)
-            async for chunk in self._run_turn(request, turn_id):
-                yield chunk
+
+        await self._initialize_tools(request.toolgroups)
+        async for chunk in self._run_turn(request, turn_id):
+            yield chunk

    async def resume_turn(self, request: AgentTurnResumeRequest) -> AsyncGenerator:
-        async with tracing.span("resume_turn") as span:
+        span = tracing.get_current_span()
+        if span:
            span.set_attribute("agent_id", self.agent_id)
            span.set_attribute("session_id", request.session_id)
-            span.set_attribute("turn_id", request.turn_id)
            span.set_attribute("request", request.model_dump_json())
-            async for chunk in self._run_turn(request):
-                yield chunk
+            span.set_attribute("turn_id", request.turn_id)
+
+        await self._initialize_tools()
+        async for chunk in self._run_turn(request):
+            yield chunk

    async def _run_turn(
        self,
@ -218,18 +216,9 @@ class ChatAgent(ShieldRunnerMixin):
        steps = []
        messages = await self.get_messages_from_turns(turns)
        if is_resume:
-            if isinstance(request.tool_responses[0], ToolResponseMessage):
-                tool_response_messages = request.tool_responses
-                tool_responses = [
-                    ToolResponse(call_id=x.call_id, tool_name=x.tool_name, content=x.content)
-                    for x in request.tool_responses
-                ]
-            else:
-                tool_response_messages = [
-                    ToolResponseMessage(call_id=x.call_id, tool_name=x.tool_name, content=x.content)
-                    for x in request.tool_responses
-                ]
-                tool_responses = request.tool_responses
+            tool_response_messages = [
+                ToolResponseMessage(call_id=x.call_id, content=x.content) for x in request.tool_responses
+            ]
            messages.extend(tool_response_messages)
            last_turn = turns[-1]
            last_turn_messages = self.turn_to_messages(last_turn)
@ -247,12 +236,12 @@ class ChatAgent(ShieldRunnerMixin):
            in_progress_tool_call_step = await self.storage.get_in_progress_tool_call_step(
                request.session_id, request.turn_id
            )
-            now = datetime.now().astimezone().isoformat()
+            now = datetime.now(timezone.utc).isoformat()
            tool_execution_step = ToolExecutionStep(
                step_id=(in_progress_tool_call_step.step_id if in_progress_tool_call_step else str(uuid.uuid4())),
                turn_id=request.turn_id,
                tool_calls=(in_progress_tool_call_step.tool_calls if in_progress_tool_call_step else []),
-                tool_responses=tool_responses,
+                tool_responses=request.tool_responses,
                completed_at=now,
                started_at=(in_progress_tool_call_step.started_at if in_progress_tool_call_step else now),
            )
@ -272,7 +261,7 @@ class ChatAgent(ShieldRunnerMixin):
            start_time = last_turn.started_at
        else:
            messages.extend(request.messages)
-            start_time = datetime.now().astimezone().isoformat()
+            start_time = datetime.now(timezone.utc).isoformat()
            input_messages = request.messages

        output_message = None
@ -283,7 +272,6 @@ class ChatAgent(ShieldRunnerMixin):
            sampling_params=self.agent_config.sampling_params,
            stream=request.stream,
            documents=request.documents if not is_resume else None,
-            toolgroups_for_turn=request.toolgroups if not is_resume else None,
        ):
            if isinstance(chunk, CompletionMessage):
                output_message = chunk
@ -304,7 +292,7 @@ class ChatAgent(ShieldRunnerMixin):
            input_messages=input_messages,
            output_message=output_message,
            started_at=start_time,
-            completed_at=datetime.now().astimezone().isoformat(),
+            completed_at=datetime.now(timezone.utc).isoformat(),
            steps=steps,
        )
        await self.storage.add_turn_to_session(request.session_id, turn)
@ -335,7 +323,6 @@ class ChatAgent(ShieldRunnerMixin):
        sampling_params: SamplingParams,
        stream: bool = False,
        documents: Optional[List[Document]] = None,
-        toolgroups_for_turn: Optional[List[AgentToolGroup]] = None,
    ) -> AsyncGenerator:
        # Doing async generators makes downstream code much simpler and everything amenable to
        # streaming. However, it also makes things complicated here because AsyncGenerators cannot
@ -358,7 +345,6 @@ class ChatAgent(ShieldRunnerMixin):
            sampling_params,
            stream,
            documents,
-            toolgroups_for_turn,
        ):
            if isinstance(res, bool):
                return
@ -397,7 +383,7 @@ class ChatAgent(ShieldRunnerMixin):
                return

            step_id = str(uuid.uuid4())
-            shield_call_start_time = datetime.now().astimezone().isoformat()
+            shield_call_start_time = datetime.now(timezone.utc).isoformat()
            try:
                yield AgentTurnResponseStreamChunk(
                    event=AgentTurnResponseEvent(
@ -421,7 +407,7 @@ class ChatAgent(ShieldRunnerMixin):
                                turn_id=turn_id,
                                violation=e.violation,
                                started_at=shield_call_start_time,
-                                completed_at=datetime.now().astimezone().isoformat(),
+                                completed_at=datetime.now(timezone.utc).isoformat(),
                            ),
                        )
                    )
@ -444,7 +430,7 @@ class ChatAgent(ShieldRunnerMixin):
                            turn_id=turn_id,
                            violation=None,
                            started_at=shield_call_start_time,
-                            completed_at=datetime.now().astimezone().isoformat(),
+                            completed_at=datetime.now(timezone.utc).isoformat(),
                        ),
                    )
                )
@ -459,30 +445,35 @@ class ChatAgent(ShieldRunnerMixin):
        sampling_params: SamplingParams,
        stream: bool = False,
        documents: Optional[List[Document]] = None,
-        toolgroups_for_turn: Optional[List[AgentToolGroup]] = None,
    ) -> AsyncGenerator:
-        # TODO: simplify all of this code, it can be simpler
-        toolgroup_args = {}
-        toolgroups = set()
-        for toolgroup in self.agent_config.toolgroups + (toolgroups_for_turn or []):
-            if isinstance(toolgroup, AgentToolGroupWithArgs):
-                tool_group_name, tool_name = self._parse_toolgroup_name(toolgroup.name)
-                toolgroups.add(tool_group_name)
-                toolgroup_args[tool_group_name] = toolgroup.args
-            else:
-                toolgroups.add(toolgroup)
-
-        tool_defs, tool_to_group = await self._get_tool_defs(toolgroups_for_turn)
+        # if documents are passed in a turn, we extract the raw text of each document
+        # and append it to the content of the last input message
        if documents:
-            await self.handle_documents(session_id, documents, input_messages, tool_defs)
+            contexts = []
+            for document in documents:
+                raw_document_text = await get_raw_document_text(document)
+                contexts.append(raw_document_text)
+
+            attached_context = "\n".join(contexts)
+            if isinstance(input_messages[-1].content, str):
+                input_messages[-1].content += attached_context
+            elif isinstance(input_messages[-1].content, list):
+                input_messages[-1].content.append(TextContentItem(text=attached_context))
+            else:
+                input_messages[-1].content = [
+                    input_messages[-1].content,
+                    TextContentItem(text=attached_context),
+                ]

        session_info = await self.storage.get_session_info(session_id)
        # if the session has a memory bank id, let the memory tool use it
        if session_info and session_info.vector_db_id:
-            if RAG_TOOL_GROUP not in toolgroup_args:
-                toolgroup_args[RAG_TOOL_GROUP] = {"vector_db_ids": [session_info.vector_db_id]}
-            else:
-                toolgroup_args[RAG_TOOL_GROUP]["vector_db_ids"].append(session_info.vector_db_id)
+            for tool_name in self.tool_name_to_args.keys():
+                if tool_name == MEMORY_QUERY_TOOL:
+                    if "vector_db_ids" not in self.tool_name_to_args[tool_name]:
+                        self.tool_name_to_args[tool_name]["vector_db_ids"] = [session_info.vector_db_id]
+                    else:
+                        self.tool_name_to_args[tool_name]["vector_db_ids"].append(session_info.vector_db_id)

        output_attachments = []

@ -494,7 +485,7 @@ class ChatAgent(ShieldRunnerMixin):
            client_tools[tool.name] = tool
        while True:
            step_id = str(uuid.uuid4())
-            inference_start_time = datetime.now().astimezone().isoformat()
+            inference_start_time = datetime.now(timezone.utc).isoformat()
            yield AgentTurnResponseStreamChunk(
                event=AgentTurnResponseEvent(
                    payload=AgentTurnResponseStepStartPayload(
@ -512,7 +503,7 @@ class ChatAgent(ShieldRunnerMixin):
                async for chunk in await self.inference_api.chat_completion(
                    self.agent_config.model,
                    input_messages,
-                    tools=tool_defs,
+                    tools=self.tool_defs,
                    tool_prompt_format=self.agent_config.tool_config.tool_prompt_format,
                    response_format=self.agent_config.response_format,
                    stream=True,
@ -604,7 +595,7 @@ class ChatAgent(ShieldRunnerMixin):
                            turn_id=turn_id,
                            model_response=copy.deepcopy(message),
                            started_at=inference_start_time,
-                            completed_at=datetime.now().astimezone().isoformat(),
+                            completed_at=datetime.now(timezone.utc).isoformat(),
                        ),
                    )
                )
@ -636,125 +627,143 @@ class ChatAgent(ShieldRunnerMixin):
                    logger.debug(f"completion message with EOM (iter: {n_iter}): {str(message)}")
                    input_messages = input_messages + [message]
            else:
-                logger.debug(f"completion message (iter: {n_iter}) from the model: {str(message)}")
-                # 1. Start the tool execution step and progress
-                step_id = str(uuid.uuid4())
-                yield AgentTurnResponseStreamChunk(
-                    event=AgentTurnResponseEvent(
-                        payload=AgentTurnResponseStepStartPayload(
-                            step_type=StepType.tool_execution.value,
-                            step_id=step_id,
-                        )
-                    )
-                )
-                tool_call = message.tool_calls[0]
-                yield AgentTurnResponseStreamChunk(
-                    event=AgentTurnResponseEvent(
-                        payload=AgentTurnResponseStepProgressPayload(
-                            step_type=StepType.tool_execution.value,
-                            step_id=step_id,
-                            tool_call=tool_call,
-                            delta=ToolCallDelta(
-                                parse_status=ToolCallParseStatus.in_progress,
-                                tool_call=tool_call,
-                            ),
-                        )
-                    )
-                )
+                input_messages = input_messages + [message]

-                # If tool is a client tool, yield CompletionMessage and return
-                if tool_call.tool_name in client_tools:
-                    # NOTE: mark end_of_message to indicate to client that it may
-                    # call the tool and continue the conversation with the tool's response.
-                    message.stop_reason = StopReason.end_of_message
+                # Process tool calls in the message
+                client_tool_calls = []
+                non_client_tool_calls = []
+
+                # Separate client and non-client tool calls
+                for tool_call in message.tool_calls:
+                    if tool_call.tool_name in client_tools:
+                        client_tool_calls.append(tool_call)
+                    else:
+                        non_client_tool_calls.append(tool_call)
+
+                # Process non-client tool calls first
+                for tool_call in non_client_tool_calls:
+                    step_id = str(uuid.uuid4())
+                    yield AgentTurnResponseStreamChunk(
+                        event=AgentTurnResponseEvent(
+                            payload=AgentTurnResponseStepStartPayload(
+                                step_type=StepType.tool_execution.value,
+                                step_id=step_id,
+                            )
+                        )
+                    )
+
+                    yield AgentTurnResponseStreamChunk(
+                        event=AgentTurnResponseEvent(
+                            payload=AgentTurnResponseStepProgressPayload(
+                                step_type=StepType.tool_execution.value,
+                                step_id=step_id,
+                                delta=ToolCallDelta(
+                                    parse_status=ToolCallParseStatus.in_progress,
+                                    tool_call=tool_call,
+                                ),
+                            )
+                        )
+                    )
+
+                    # Execute the tool call
+                    async with tracing.span(
+                        "tool_execution",
+                        {
+                            "tool_name": tool_call.tool_name,
+                            "input": message.model_dump_json(),
+                        },
+                    ) as span:
+                        tool_execution_start_time = datetime.now(timezone.utc).isoformat()
+                        tool_result = await self.execute_tool_call_maybe(
+                            session_id,
+                            tool_call,
+                        )
+                        if tool_result.content is None:
+                            raise ValueError(
+                                f"Tool call result (id: {tool_call.call_id}, name: {tool_call.tool_name}) does not have any content"
+                            )
+                        result_message = ToolResponseMessage(
+                            call_id=tool_call.call_id,
+                            content=tool_result.content,
+                        )
+                        span.set_attribute("output", result_message.model_dump_json())
+
+                        # Store tool execution step
+                        tool_execution_step = ToolExecutionStep(
+                            step_id=step_id,
+                            turn_id=turn_id,
+                            tool_calls=[tool_call],
+                            tool_responses=[
+                                ToolResponse(
+                                    call_id=tool_call.call_id,
+                                    tool_name=tool_call.tool_name,
+                                    content=tool_result.content,
+                                    metadata=tool_result.metadata,
+                                )
+                            ],
+                            started_at=tool_execution_start_time,
+                            completed_at=datetime.now(timezone.utc).isoformat(),
+                        )
+
+                        # Yield the step completion event
+                        yield AgentTurnResponseStreamChunk(
+                            event=AgentTurnResponseEvent(
+                                payload=AgentTurnResponseStepCompletePayload(
+                                    step_type=StepType.tool_execution.value,
+                                    step_id=step_id,
+                                    step_details=tool_execution_step,
+                                )
+                            )
+                        )
+
+                        # Add the result message to input_messages for the next iteration
+                        input_messages.append(result_message)
+
+                        # TODO: add tool-input touchpoint and a "start" event for this step also
+                        # but that needs a lot more refactoring of Tool code potentially
+                        if (type(result_message.content) is str) and (
+                            out_attachment := _interpret_content_as_attachment(result_message.content)
+                        ):
+                            # NOTE: when we push this message back to the model, the model may ignore the
+                            # attached file path etc. since the model is trained to only provide a user message
+                            # with the summary. We keep all generated attachments and then attach them to final message
+                            output_attachments.append(out_attachment)
+
+                # If there are client tool calls, yield a message with only those tool calls
+                if client_tool_calls:
                    await self.storage.set_in_progress_tool_call_step(
                        session_id,
                        turn_id,
                        ToolExecutionStep(
                            step_id=step_id,
                            turn_id=turn_id,
-                            tool_calls=[tool_call],
+                            tool_calls=client_tool_calls,
                            tool_responses=[],
-                            started_at=datetime.now().astimezone().isoformat(),
+                            started_at=datetime.now(timezone.utc).isoformat(),
                        ),
                    )
-                    yield message
+
+                    # Create a copy of the message with only client tool calls
+                    client_message = message.model_copy(deep=True)
+                    client_message.tool_calls = client_tool_calls
+                    # NOTE: mark end_of_message to indicate to client that it may
+                    # call the tool and continue the conversation with the tool's response.
+                    client_message.stop_reason = StopReason.end_of_message
+
+                    # Yield the message with client tool calls
+                    yield client_message
                    return

-                # If tool is a builtin server tool, execute it
-                tool_name = tool_call.tool_name
-                if isinstance(tool_name, BuiltinTool):
-                    tool_name = tool_name.value
-                async with tracing.span(
-                    "tool_execution",
-                    {
-                        "tool_name": tool_name,
-                        "input": message.model_dump_json(),
-                    },
-                ) as span:
-                    tool_execution_start_time = datetime.now().astimezone().isoformat()
-                    tool_call = message.tool_calls[0]
-                    tool_result = await execute_tool_call_maybe(
-                        self.tool_runtime_api,
-                        session_id,
-                        tool_call,
-                        toolgroup_args,
-                        tool_to_group,
-                    )
-                    if tool_result.content is None:
-                        raise ValueError(
-                            f"Tool call result (id: {tool_call.call_id}, name: {tool_call.tool_name}) does not have any content"
-                        )
-                    result_messages = [
-                        ToolResponseMessage(
-                            call_id=tool_call.call_id,
-                            tool_name=tool_call.tool_name,
-                            content=tool_result.content,
-                        )
-                    ]
-                    assert len(result_messages) == 1, "Currently not supporting multiple messages"
-                    result_message = result_messages[0]
-                    span.set_attribute("output", result_message.model_dump_json())
+    async def _initialize_tools(
+        self,
+        toolgroups_for_turn: Optional[List[AgentToolGroup]] = None,
+    ) -> None:
+        toolgroup_to_args = {}
+        for toolgroup in (self.agent_config.toolgroups or []) + (toolgroups_for_turn or []):
+            if isinstance(toolgroup, AgentToolGroupWithArgs):
+                tool_group_name, _ = self._parse_toolgroup_name(toolgroup.name)
+                toolgroup_to_args[tool_group_name] = toolgroup.args

-                yield AgentTurnResponseStreamChunk(
-                    event=AgentTurnResponseEvent(
-                        payload=AgentTurnResponseStepCompletePayload(
-                            step_type=StepType.tool_execution.value,
-                            step_id=step_id,
-                            step_details=ToolExecutionStep(
-                                step_id=step_id,
-                                turn_id=turn_id,
-                                tool_calls=[tool_call],
-                                tool_responses=[
-                                    ToolResponse(
-                                        call_id=result_message.call_id,
-                                        tool_name=result_message.tool_name,
-                                        content=result_message.content,
-                                        metadata=tool_result.metadata,
-                                    )
-                                ],
-                                started_at=tool_execution_start_time,
-                                completed_at=datetime.now().astimezone().isoformat(),
-                            ),
-                        )
-                    )
-                )
-
-                # TODO: add tool-input touchpoint and a "start" event for this step also
-                # but that needs a lot more refactoring of Tool code potentially
-                if (type(result_message.content) is str) and (
-                    out_attachment := _interpret_content_as_attachment(result_message.content)
-                ):
-                    # NOTE: when we push this message back to the model, the model may ignore the
-                    # attached file path etc. since the model is trained to only provide a user message
-                    # with the summary. We keep all generated attachments and then attach them to final message
-                    output_attachments.append(out_attachment)
-
-                input_messages = input_messages + [message, result_message]
-
-    async def _get_tool_defs(
-        self, toolgroups_for_turn: Optional[List[AgentToolGroup]] = None
-    ) -> Tuple[List[ToolDefinition], Dict[str, str]]:
        # Determine which tools to include
        tool_groups_to_include = toolgroups_for_turn or self.agent_config.toolgroups or []
        agent_config_toolgroups = []
@ -763,8 +772,10 @@ class ChatAgent(ShieldRunnerMixin):
            if name not in agent_config_toolgroups:
                agent_config_toolgroups.append(name)

+        toolgroup_to_args = toolgroup_to_args or {}
+
        tool_name_to_def = {}
-        tool_to_group = {}
+        tool_name_to_args = {}

        for tool_def in self.agent_config.client_tools:
            if tool_name_to_def.get(tool_def.name, None):
@ -782,53 +793,38 @@ class ChatAgent(ShieldRunnerMixin):
                    for param in tool_def.parameters
                },
            )
-            tool_to_group[tool_def.name] = "__client_tools__"
        for toolgroup_name_with_maybe_tool_name in agent_config_toolgroups:
-            toolgroup_name, tool_name = self._parse_toolgroup_name(toolgroup_name_with_maybe_tool_name)
+            toolgroup_name, input_tool_name = self._parse_toolgroup_name(toolgroup_name_with_maybe_tool_name)
            tools = await self.tool_groups_api.list_tools(toolgroup_id=toolgroup_name)
            if not tools.data:
                available_tool_groups = ", ".join(
                    [t.identifier for t in (await self.tool_groups_api.list_tool_groups()).data]
                )
                raise ValueError(f"Toolgroup {toolgroup_name} not found, available toolgroups: {available_tool_groups}")
-            if tool_name is not None and not any(tool.identifier == tool_name for tool in tools.data):
+            if input_tool_name is not None and not any(tool.identifier == input_tool_name for tool in tools.data):
                raise ValueError(
-                    f"Tool {tool_name} not found in toolgroup {toolgroup_name}. Available tools: {', '.join([tool.identifier for tool in tools.data])}"
+                    f"Tool {input_tool_name} not found in toolgroup {toolgroup_name}. Available tools: {', '.join([tool.identifier for tool in tools.data])}"
                )

            for tool_def in tools.data:
                if toolgroup_name.startswith("builtin") and toolgroup_name != RAG_TOOL_GROUP:
-                    tool_name = tool_def.identifier
-                    built_in_type = BuiltinTool.brave_search
-                    if tool_name == "web_search":
-                        built_in_type = BuiltinTool.brave_search
+                    identifier: str | BuiltinTool | None = tool_def.identifier
+                    if identifier == "web_search":
+                        identifier = BuiltinTool.brave_search
                    else:
-                        built_in_type = BuiltinTool(tool_name)
+                        identifier = BuiltinTool(identifier)
+                else:
+                    # add if tool_name is unspecified or the tool_def identifier is the same as the tool_name
+                    if input_tool_name in (None, tool_def.identifier):
+                        identifier = tool_def.identifier
+                    else:
+                        identifier = None

-                    if tool_name_to_def.get(built_in_type, None):
-                        raise ValueError(f"Tool {built_in_type} already exists")
-
-                    tool_name_to_def[built_in_type] = ToolDefinition(
-                        tool_name=built_in_type,
-                        description=tool_def.description,
-                        parameters={
-                            param.name: ToolParamDefinition(
-                                param_type=param.parameter_type,
-                                description=param.description,
-                                required=param.required,
-                                default=param.default,
-                            )
-                            for param in tool_def.parameters
-                        },
-                    )
-                    tool_to_group[built_in_type] = tool_def.toolgroup_id
-                    continue
-
-                if tool_name_to_def.get(tool_def.identifier, None):
-                    raise ValueError(f"Tool {tool_def.identifier} already exists")
-                if tool_name in (None, tool_def.identifier):
+                if tool_name_to_def.get(identifier, None):
+                    raise ValueError(f"Tool {identifier} already exists")
+                if identifier:
                    tool_name_to_def[tool_def.identifier] = ToolDefinition(
-                        tool_name=tool_def.identifier,
+                        tool_name=identifier,
                        description=tool_def.description,
                        parameters={
                            param.name: ToolParamDefinition(
@ -840,9 +836,12 @@ class ChatAgent(ShieldRunnerMixin):
                            for param in tool_def.parameters
                        },
                    )
-                    tool_to_group[tool_def.identifier] = tool_def.toolgroup_id
+                    tool_name_to_args[tool_def.identifier] = toolgroup_to_args.get(toolgroup_name, {})

-        return list(tool_name_to_def.values()), tool_to_group
+        self.tool_defs, self.tool_name_to_args = (
+            list(tool_name_to_def.values()),
+            tool_name_to_args,
+        )

    def _parse_toolgroup_name(self, toolgroup_name_with_maybe_tool_name: str) -> tuple[str, Optional[str]]:
        """Parse a toolgroup name into its components.
@ -861,176 +860,59 @@ class ChatAgent(ShieldRunnerMixin):
            tool_group, tool_name = split_names[0], None
        return tool_group, tool_name

-    async def handle_documents(
+    async def execute_tool_call_maybe(
        self,
        session_id: str,
-        documents: List[Document],
-        input_messages: List[Message],
-        tool_defs: Dict[str, ToolDefinition],
-    ) -> None:
-        memory_tool = any(tool_def.tool_name == MEMORY_QUERY_TOOL for tool_def in tool_defs)
-        code_interpreter_tool = any(tool_def.tool_name == BuiltinTool.code_interpreter for tool_def in tool_defs)
-        content_items = []
-        url_items = []
-        pattern = re.compile("^(https?://|file://|data:)")
-        for d in documents:
-            if isinstance(d.content, URL):
-                url_items.append(d.content)
-            elif pattern.match(d.content):
-                url_items.append(URL(uri=d.content))
+        tool_call: ToolCall,
+    ) -> ToolInvocationResult:
+        tool_name = tool_call.tool_name
+        registered_tool_names = [tool_def.tool_name for tool_def in self.tool_defs]
+        if tool_name not in registered_tool_names:
+            raise ValueError(
+                f"Tool {tool_name} not found in provided tools, registered tools: {', '.join([str(x) for x in registered_tool_names])}"
+            )
+        if isinstance(tool_name, BuiltinTool):
+            if tool_name == BuiltinTool.brave_search:
+                tool_name_str = WEB_SEARCH_TOOL
            else:
-                content_items.append(d)
-
-        # Save the contents to a tempdir and use its path as a URL if code interpreter is present
-        if code_interpreter_tool:
-            for c in content_items:
-                temp_file_path = os.path.join(self.tempdir, f"{make_random_string()}.txt")
-                with open(temp_file_path, "w") as temp_file:
-                    temp_file.write(c.content)
-                url_items.append(URL(uri=f"file://{temp_file_path}"))
-
-        if memory_tool and code_interpreter_tool:
-            # if both memory and code_interpreter are available, we download the URLs
-            # and attach the data to the last message.
-            msg = await attachment_message(self.tempdir, url_items)
-            input_messages.append(msg)
-            # Since memory is present, add all the data to the memory bank
-            await self.add_to_session_vector_db(session_id, documents)
-        elif code_interpreter_tool:
-            # if only code_interpreter is available, we download the URLs to a tempdir
-            # and attach the path to them as a message to inference with the
-            # assumption that the model invokes the code_interpreter tool with the path
-            msg = await attachment_message(self.tempdir, url_items)
-            input_messages.append(msg)
-        elif memory_tool:
-            # if only memory is available, we load the data from the URLs and content items to the memory bank
-            await self.add_to_session_vector_db(session_id, documents)
+                tool_name_str = tool_name.value
        else:
-            # if no memory or code_interpreter tool is available,
-            # we try to load the data from the URLs and content items as a message to inference
-            # and add it to the last message's context
-            input_messages[-1].context = "\n".join(
-                [doc.content for doc in content_items] + await load_data_from_urls(url_items)
-            )
+            tool_name_str = tool_name

-    async def _ensure_vector_db(self, session_id: str) -> str:
-        session_info = await self.storage.get_session_info(session_id)
-        if session_info is None:
-            raise ValueError(f"Session {session_id} not found")
-
-        if session_info.vector_db_id is None:
-            vector_db_id = f"vector_db_{session_id}"
-
-            # TODO: the semantic for registration is definitely not "creation"
-            # so we need to fix it if we expect the agent to create a new vector db
-            # for each session
-            await self.vector_io_api.register_vector_db(
-                vector_db_id=vector_db_id,
-                embedding_model="all-MiniLM-L6-v2",
-            )
-            await self.storage.add_vector_db_to_session(session_id, vector_db_id)
-        else:
-            vector_db_id = session_info.vector_db_id
-
-        return vector_db_id
-
-    async def add_to_session_vector_db(self, session_id: str, data: List[Document]) -> None:
-        vector_db_id = await self._ensure_vector_db(session_id)
-        documents = [
-            RAGDocument(
-                document_id=str(uuid.uuid4()),
-                content=a.content,
-                mime_type=a.mime_type,
-                metadata={},
-            )
-            for a in data
-        ]
-        await self.tool_runtime_api.rag_tool.insert(
-            documents=documents,
-            vector_db_id=vector_db_id,
-            chunk_size_in_tokens=512,
+        logger.info(f"executing tool call: {tool_name_str} with args: {tool_call.arguments}")
+        result = await self.tool_runtime_api.invoke_tool(
+            tool_name=tool_name_str,
+            kwargs={
+                "session_id": session_id,
+                # get the arguments generated by the model and augment with toolgroup arg overrides for the agent
+                **tool_call.arguments,
+                **self.tool_name_to_args.get(tool_name_str, {}),
+            },
        )
+        logger.debug(f"tool call {tool_name_str} completed with result: {result}")
+        return result


-async def load_data_from_urls(urls: List[URL]) -> List[str]:
-    data = []
-    for url in urls:
-        uri = url.uri
-        if uri.startswith("file://"):
-            filepath = uri[len("file://") :]
-            with open(filepath, "r") as f:
-                data.append(f.read())
-        elif uri.startswith("http"):
-            async with httpx.AsyncClient() as client:
-                r = await client.get(uri)
-                resp = r.text
-                data.append(resp)
-    return data
+async def load_data_from_url(url: str) -> str:
+    """Download the resource at an HTTP(S) URL and return its body as text.
+
+    Args:
+        url: Address to fetch; must start with ``http``.
+
+    Returns:
+        The response body decoded as text.
+
+    Raises:
+        ValueError: If ``url`` does not start with ``http``.
+    """
+    if not url.startswith("http"):
+        # Report the offending value itself: ``type(url)`` is always ``str`` here,
+        # which made the previous message useless for debugging.
+        raise ValueError(f"Unexpected URL: {url}")
+    async with httpx.AsyncClient() as client:
+        response = await client.get(url)
+        return response.text


-async def attachment_message(tempdir: str, urls: List[URL]) -> ToolResponseMessage:
-    content = []
-
-    for url in urls:
-        uri = url.uri
-        if uri.startswith("file://"):
-            filepath = uri[len("file://") :]
-        elif uri.startswith("http"):
-            path = urlparse(uri).path
-            basename = os.path.basename(path)
-            filepath = f"{tempdir}/{make_random_string() + basename}"
-            logger.info(f"Downloading {url} -> {filepath}")
-
-            async with httpx.AsyncClient() as client:
-                r = await client.get(uri)
-                resp = r.text
-                with open(filepath, "w") as fp:
-                    fp.write(resp)
-        else:
-            raise ValueError(f"Unsupported URL {url}")
-
-        content.append(
-            TextContentItem(
-                text=f'# User provided a file accessible to you at "{filepath}"\nYou can use code_interpreter to load and inspect it.'
-            )
-        )
-
-    return ToolResponseMessage(
-        call_id="",
-        tool_name=BuiltinTool.code_interpreter,
-        content=content,
-    )
-
-
-async def execute_tool_call_maybe(
-    tool_runtime_api: ToolRuntime,
-    session_id: str,
-    tool_call: ToolCall,
-    toolgroup_args: Dict[str, Dict[str, Any]],
-    tool_to_group: Dict[str, str],
-) -> ToolInvocationResult:
-    name = tool_call.tool_name
-    group_name = tool_to_group.get(name, None)
-    if group_name is None:
-        raise ValueError(f"Tool {name} not found in any tool group")
-    if isinstance(name, BuiltinTool):
-        if name == BuiltinTool.brave_search:
-            name = WEB_SEARCH_TOOL
-        else:
-            name = name.value
-
-    logger.info(f"executing tool call: {name} with args: {tool_call.arguments}")
-    result = await tool_runtime_api.invoke_tool(
-        tool_name=name,
-        kwargs={
-            "session_id": session_id,
-            # get the arguments generated by the model and augment with toolgroup arg overrides for the agent
-            **tool_call.arguments,
-            **toolgroup_args.get(group_name, {}),
-        },
-    )
-    logger.info(f"tool call {name} completed with result: {result}")
-    return result
+async def get_raw_document_text(document: Document) -> str:
+    """Return the textual content of a ``text/*`` document.
+
+    URL content is fetched via ``load_data_from_url``; plain strings and
+    ``TextContentItem`` values are returned directly.
+
+    Raises:
+        ValueError: If the mime type does not start with ``text/`` or the
+            content is of an unsupported type.
+    """
+    if not document.mime_type.startswith("text/"):
+        raise ValueError(f"Unexpected document mime type: {document.mime_type}")
+
+    content = document.content
+    if isinstance(content, URL):
+        return await load_data_from_url(content.uri)
+    if isinstance(content, str):
+        return content
+    if isinstance(content, TextContentItem):
+        return content.text
+    raise ValueError(f"Unexpected document content type: {type(content)}")


 def _interpret_content_as_attachment(
--- a/llama_stack/providers/inline/agents/meta_reference/agents.py
+++ b/llama_stack/providers/inline/agents/meta_reference/agents.py
@ -172,7 +172,7 @@ class MetaReferenceAgentsImpl(Agents):
        agent_id: str,
        session_id: str,
        turn_id: str,
-        tool_responses: Union[List[ToolResponse], List[ToolResponseMessage]],
+        tool_responses: List[ToolResponse],
        stream: Optional[bool] = False,
    ) -> AsyncGenerator:
        request = AgentTurnResumeRequest(
--- a/llama_stack/providers/inline/agents/meta_reference/persistence.py
+++ b/llama_stack/providers/inline/agents/meta_reference/persistence.py
@ -7,12 +7,15 @@
 import json
 import logging
 import uuid
-from datetime import datetime
+from datetime import datetime, timezone
 from typing import List, Optional

 from pydantic import BaseModel

 from llama_stack.apis.agents import ToolExecutionStep, Turn
+from llama_stack.distribution.access_control import check_access
+from llama_stack.distribution.datatypes import AccessAttributes
+from llama_stack.distribution.request_headers import get_auth_attributes
 from llama_stack.providers.utils.kvstore import KVStore

 log = logging.getLogger(__name__)
@ -24,6 +27,7 @@ class AgentSessionInfo(BaseModel):
    # TODO: is this used anywhere?
    vector_db_id: Optional[str] = None
    started_at: datetime
+    access_attributes: Optional[AccessAttributes] = None


 class AgentPersistence:
@ -33,11 +37,18 @@ class AgentPersistence:

    async def create_session(self, name: str) -> str:
        session_id = str(uuid.uuid4())
+
+        # Get current user's auth attributes for new sessions
+        auth_attributes = get_auth_attributes()
+        access_attributes = AccessAttributes(**auth_attributes) if auth_attributes else None
+
        session_info = AgentSessionInfo(
            session_id=session_id,
            session_name=name,
-            started_at=datetime.now(),
+            started_at=datetime.now(timezone.utc),
+            access_attributes=access_attributes,
        )
+
        await self.kvstore.set(
            key=f"session:{self.agent_id}:{session_id}",
            value=session_info.model_dump_json(),
@ -51,12 +62,34 @@ class AgentPersistence:
        if not value:
            return None

-        return AgentSessionInfo(**json.loads(value))
+        session_info = AgentSessionInfo(**json.loads(value))
+
+        # Check access to session
+        if not self._check_session_access(session_info):
+            return None
+
+        return session_info
+
+    def _check_session_access(self, session_info: AgentSessionInfo) -> bool:
+        """Check if current user has access to the session.
+
+        Returns:
+            True when the session carries no access attributes, otherwise the
+            result of ``check_access`` for the current auth attributes.
+        """
+        # Handle backward compatibility for old sessions without access control.
+        # ``access_attributes`` is a declared field defaulting to None, so records
+        # stored before it existed deserialize to None; ``hasattr`` is always true
+        # on the model and could never detect them.
+        if session_info.access_attributes is None:
+            return True
+
+        return check_access(session_info.session_id, session_info.access_attributes, get_auth_attributes())
+
+    async def get_session_if_accessible(self, session_id: str) -> Optional[AgentSessionInfo]:
+        """Look up a session via ``get_session_info`` for the sub-session helpers.
+
+        Returns the session info, or None when ``get_session_info`` yields nothing
+        (no stored value, or the access check failed).
+        """
+        info = await self.get_session_info(session_id)
+        return info if info else None

    async def add_vector_db_to_session(self, session_id: str, vector_db_id: str):
-        session_info = await self.get_session_info(session_id)
+        session_info = await self.get_session_if_accessible(session_id)
        if session_info is None:
-            raise ValueError(f"Session {session_id} not found")
+            raise ValueError(f"Session {session_id} not found or access denied")

        session_info.vector_db_id = vector_db_id
        await self.kvstore.set(
@ -65,12 +98,18 @@ class AgentPersistence:
        )

    async def add_turn_to_session(self, session_id: str, turn: Turn):
+        if not await self.get_session_if_accessible(session_id):
+            raise ValueError(f"Session {session_id} not found or access denied")
+
        await self.kvstore.set(
            key=f"session:{self.agent_id}:{session_id}:{turn.turn_id}",
            value=turn.model_dump_json(),
        )

    async def get_session_turns(self, session_id: str) -> List[Turn]:
+        if not await self.get_session_if_accessible(session_id):
+            raise ValueError(f"Session {session_id} not found or access denied")
+
        values = await self.kvstore.range(
            start_key=f"session:{self.agent_id}:{session_id}:",
            end_key=f"session:{self.agent_id}:{session_id}:\xff\xff\xff\xff",
@ -87,6 +126,9 @@ class AgentPersistence:
        return turns

    async def get_session_turn(self, session_id: str, turn_id: str) -> Optional[Turn]:
+        if not await self.get_session_if_accessible(session_id):
+            raise ValueError(f"Session {session_id} not found or access denied")
+
        value = await self.kvstore.get(
            key=f"session:{self.agent_id}:{session_id}:{turn_id}",
        )
@ -95,24 +137,36 @@ class AgentPersistence:
        return Turn(**json.loads(value))

    async def set_in_progress_tool_call_step(self, session_id: str, turn_id: str, step: ToolExecutionStep):
+        if not await self.get_session_if_accessible(session_id):
+            raise ValueError(f"Session {session_id} not found or access denied")
+
        await self.kvstore.set(
            key=f"in_progress_tool_call_step:{self.agent_id}:{session_id}:{turn_id}",
            value=step.model_dump_json(),
        )

    async def get_in_progress_tool_call_step(self, session_id: str, turn_id: str) -> Optional[ToolExecutionStep]:
+        if not await self.get_session_if_accessible(session_id):
+            return None
+
        value = await self.kvstore.get(
            key=f"in_progress_tool_call_step:{self.agent_id}:{session_id}:{turn_id}",
        )
        return ToolExecutionStep(**json.loads(value)) if value else None

    async def set_num_infer_iters_in_turn(self, session_id: str, turn_id: str, num_infer_iters: int):
+        if not await self.get_session_if_accessible(session_id):
+            raise ValueError(f"Session {session_id} not found or access denied")
+
        await self.kvstore.set(
            key=f"num_infer_iters_in_turn:{self.agent_id}:{session_id}:{turn_id}",
            value=str(num_infer_iters),
        )

    async def get_num_infer_iters_in_turn(self, session_id: str, turn_id: str) -> Optional[int]:
+        if not await self.get_session_if_accessible(session_id):
+            return None
+
        value = await self.kvstore.get(
            key=f"num_infer_iters_in_turn:{self.agent_id}:{session_id}:{turn_id}",
        )
--- a/llama_stack/providers/inline/datasetio/localfs/config.py
+++ b/llama_stack/providers/inline/datasetio/localfs/config.py
@ -3,9 +3,10 @@
 #
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
+from typing import Any, Dict
+
 from pydantic import BaseModel

-from llama_stack.distribution.utils.config_dirs import RUNTIME_BASE_DIR
 from llama_stack.providers.utils.kvstore.config import (
    KVStoreConfig,
    SqliteKVStoreConfig,
@ -13,6 +14,13 @@ from llama_stack.providers.utils.kvstore.config import (


 class LocalFSDatasetIOConfig(BaseModel):
-    kvstore: KVStoreConfig = SqliteKVStoreConfig(
-        db_path=(RUNTIME_BASE_DIR / "localfs_datasetio.db").as_posix()
-    )  # Uses SQLite config specific to localfs storage
+    kvstore: KVStoreConfig
+
+    @classmethod
+    def sample_run_config(cls, __distro_dir__: str, **kwargs: Any) -> Dict[str, Any]:
+        """Return a sample run config whose ``kvstore`` entry is the SQLite sample
+        config for ``localfs_datasetio.db`` under the given distro directory."""
+        kvstore_config = SqliteKVStoreConfig.sample_run_config(
+            db_name="localfs_datasetio.db",
+            __distro_dir__=__distro_dir__,
+        )
+        return {"kvstore": kvstore_config}
--- a/llama_stack/providers/inline/datasetio/localfs/datasetio.py
+++ b/llama_stack/providers/inline/datasetio/localfs/datasetio.py
@ -3,20 +3,16 @@
 #
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
-import base64
-import os
-from abc import ABC, abstractmethod
-from dataclasses import dataclass
 from typing import Any, Dict, List, Optional
-from urllib.parse import urlparse

 import pandas

-from llama_stack.apis.common.content_types import URL
-from llama_stack.apis.datasetio import DatasetIO, PaginatedRowsResult
+from llama_stack.apis.common.responses import PaginatedResponse
+from llama_stack.apis.datasetio import DatasetIO
 from llama_stack.apis.datasets import Dataset
 from llama_stack.providers.datatypes import DatasetsProtocolPrivate
-from llama_stack.providers.utils.datasetio.url_utils import get_dataframe_from_url
+from llama_stack.providers.utils.datasetio.pagination import paginate_records
+from llama_stack.providers.utils.datasetio.url_utils import get_dataframe_from_uri
 from llama_stack.providers.utils.kvstore import kvstore_impl

 from .config import LocalFSDatasetIOConfig
@ -24,30 +20,7 @@ from .config import LocalFSDatasetIOConfig
 DATASETS_PREFIX = "localfs_datasets:"


-class BaseDataset(ABC):
-    def __init__(self, *args, **kwargs) -> None:
-        super().__init__(*args, **kwargs)
-
-    @abstractmethod
-    def __len__(self) -> int:
-        raise NotImplementedError()
-
-    @abstractmethod
-    def __getitem__(self, idx):
-        raise NotImplementedError()
-
-    @abstractmethod
-    def load(self):
-        raise NotImplementedError()
-
-
-@dataclass
-class DatasetInfo:
-    dataset_def: Dataset
-    dataset_impl: BaseDataset
-
-
-class PandasDataframeDataset(BaseDataset):
+class PandasDataframeDataset:
    def __init__(self, dataset_def: Dataset, *args, **kwargs) -> None:
        super().__init__(*args, **kwargs)
        self.dataset_def = dataset_def
@ -64,23 +37,19 @@ class PandasDataframeDataset(BaseDataset):
        else:
            return self.df.iloc[idx].to_dict()

-    def _validate_dataset_schema(self, df) -> pandas.DataFrame:
-        # note that we will drop any columns in dataset that are not in the schema
-        df = df[self.dataset_def.dataset_schema.keys()]
-        # check all columns in dataset schema are present
-        assert len(df.columns) == len(self.dataset_def.dataset_schema)
-        # TODO: type checking against column types in dataset schema
-        return df
-
-    def load(self) -> None:
+    async def load(self) -> None:
+        """Populate ``self.df`` from the dataset definition's source, once.
+
+        A ``uri`` source is read through ``get_dataframe_from_uri``; a ``rows``
+        source is wrapped in a ``pandas.DataFrame``. Does nothing if ``self.df``
+        is already set.
+
+        Raises:
+            ValueError: If the source type is unsupported or loading produced
+                no dataframe.
+        """
         if self.df is not None:
             return
 
-        df = get_dataframe_from_url(self.dataset_def.url)
-        if df is None:
-            raise ValueError(f"Failed to load dataset from {self.dataset_def.url}")
+        if self.dataset_def.source.type == "uri":
+            self.df = await get_dataframe_from_uri(self.dataset_def.source.uri)
+        elif self.dataset_def.source.type == "rows":
+            self.df = pandas.DataFrame(self.dataset_def.source.rows)
+        else:
+            raise ValueError(f"Unsupported dataset source type: {self.dataset_def.source.type}")
 
-        self.df = self._validate_dataset_schema(df)
+        if self.df is None:
+            # Report the origin through ``source`` like the branches above; the
+            # old ``dataset_def.url`` attribute is no longer what this code reads.
+            raise ValueError(f"Failed to load dataset from {self.dataset_def.source}")


 class LocalFSDatasetIOImpl(DatasetIO, DatasetsProtocolPrivate):
@ -99,95 +68,44 @@ class LocalFSDatasetIOImpl(DatasetIO, DatasetsProtocolPrivate):

        for dataset in stored_datasets:
            dataset = Dataset.model_validate_json(dataset)
-            dataset_impl = PandasDataframeDataset(dataset)
-            self.dataset_infos[dataset.identifier] = DatasetInfo(
-                dataset_def=dataset,
-                dataset_impl=dataset_impl,
-            )
+            self.dataset_infos[dataset.identifier] = dataset

    async def shutdown(self) -> None: ...

    async def register_dataset(
        self,
-        dataset: Dataset,
+        dataset_def: Dataset,
    ) -> None:
        # Store in kvstore
-        key = f"{DATASETS_PREFIX}{dataset.identifier}"
+        key = f"{DATASETS_PREFIX}{dataset_def.identifier}"
        await self.kvstore.set(
            key=key,
-            value=dataset.json(),
-        )
-        dataset_impl = PandasDataframeDataset(dataset)
-        self.dataset_infos[dataset.identifier] = DatasetInfo(
-            dataset_def=dataset,
-            dataset_impl=dataset_impl,
+            value=dataset_def.model_dump_json(),
        )
+        self.dataset_infos[dataset_def.identifier] = dataset_def

    async def unregister_dataset(self, dataset_id: str) -> None:
        key = f"{DATASETS_PREFIX}{dataset_id}"
        await self.kvstore.delete(key=key)
        del self.dataset_infos[dataset_id]

-    async def get_rows_paginated(
+    async def iterrows(
        self,
        dataset_id: str,
-        rows_in_page: int,
-        page_token: Optional[str] = None,
-        filter_condition: Optional[str] = None,
-    ) -> PaginatedRowsResult:
-        dataset_info = self.dataset_infos.get(dataset_id)
-        dataset_info.dataset_impl.load()
+        start_index: Optional[int] = None,
+        limit: Optional[int] = None,
+    ) -> PaginatedResponse:
+        dataset_def = self.dataset_infos[dataset_id]
+        dataset_impl = PandasDataframeDataset(dataset_def)
+        await dataset_impl.load()

-        if page_token and not page_token.isnumeric():
-            raise ValueError("Invalid page_token")
-
-        if page_token is None or len(page_token) == 0:
-            next_page_token = 0
-        else:
-            next_page_token = int(page_token)
-
-        start = next_page_token
-        if rows_in_page == -1:
-            end = len(dataset_info.dataset_impl)
-        else:
-            end = min(start + rows_in_page, len(dataset_info.dataset_impl))
-
-        rows = dataset_info.dataset_impl[start:end]
-
-        return PaginatedRowsResult(
-            rows=rows,
-            total_count=len(rows),
-            next_page_token=str(end),
-        )
+        records = dataset_impl.df.to_dict("records")
+        return paginate_records(records, start_index, limit)

    async def append_rows(self, dataset_id: str, rows: List[Dict[str, Any]]) -> None:
-        dataset_info = self.dataset_infos.get(dataset_id)
-        if dataset_info is None:
-            raise ValueError(f"Dataset with id {dataset_id} not found")
-
-        dataset_impl = dataset_info.dataset_impl
-        dataset_impl.load()
+        dataset_def = self.dataset_infos[dataset_id]
+        dataset_impl = PandasDataframeDataset(dataset_def)
+        await dataset_impl.load()

        new_rows_df = pandas.DataFrame(rows)
-        new_rows_df = dataset_impl._validate_dataset_schema(new_rows_df)
        dataset_impl.df = pandas.concat([dataset_impl.df, new_rows_df], ignore_index=True)
-
-        url = str(dataset_info.dataset_def.url)
-        parsed_url = urlparse(url)
-
-        if parsed_url.scheme == "file" or not parsed_url.scheme:
-            file_path = parsed_url.path
-            os.makedirs(os.path.dirname(file_path), exist_ok=True)
-            dataset_impl.df.to_csv(file_path, index=False)
-        elif parsed_url.scheme == "data":
-            # For data URLs, we need to update the base64-encoded content
-            if not parsed_url.path.startswith("text/csv;base64,"):
-                raise ValueError("Data URL must be a base64-encoded CSV")
-
-            csv_buffer = dataset_impl.df.to_csv(index=False)
-            base64_content = base64.b64encode(csv_buffer.encode("utf-8")).decode("utf-8")
-            dataset_info.dataset_def.url = URL(uri=f"data:text/csv;base64,{base64_content}")
-        else:
-            raise ValueError(
-                f"Unsupported URL scheme: {parsed_url.scheme}. Only file:// and data: URLs are supported for writing."
-            )
--- a/llama_stack/providers/inline/eval/meta_reference/config.py
+++ b/llama_stack/providers/inline/eval/meta_reference/config.py
@ -3,9 +3,10 @@
 #
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
+from typing import Any, Dict
+
 from pydantic import BaseModel

-from llama_stack.distribution.utils.config_dirs import RUNTIME_BASE_DIR
 from llama_stack.providers.utils.kvstore.config import (
    KVStoreConfig,
    SqliteKVStoreConfig,
@ -13,6 +14,13 @@ from llama_stack.providers.utils.kvstore.config import (


 class MetaReferenceEvalConfig(BaseModel):
-    kvstore: KVStoreConfig = SqliteKVStoreConfig(
-        db_path=(RUNTIME_BASE_DIR / "meta_reference_eval.db").as_posix()
-    )  # Uses SQLite config specific to Meta Reference Eval storage
+    kvstore: KVStoreConfig
+
+    @classmethod
+    def sample_run_config(cls, __distro_dir__: str, **kwargs: Any) -> Dict[str, Any]:
+        """Return a sample run config whose ``kvstore`` entry is the SQLite sample
+        config for ``meta_reference_eval.db`` under the given distro directory."""
+        kvstore_config = SqliteKVStoreConfig.sample_run_config(
+            db_name="meta_reference_eval.db",
+            __distro_dir__=__distro_dir__,
+        )
+        return {"kvstore": kvstore_config}
--- a/llama_stack/providers/inline/eval/meta_reference/eval.py
+++ b/llama_stack/providers/inline/eval/meta_reference/eval.py
@ -4,7 +4,7 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 import json
-from typing import Any, Dict, List, Optional
+from typing import Any, Dict, List

 from tqdm import tqdm

@ -12,22 +12,17 @@ from llama_stack.apis.agents import Agents, StepType
 from llama_stack.apis.benchmarks import Benchmark
 from llama_stack.apis.datasetio import DatasetIO
 from llama_stack.apis.datasets import Datasets
-from llama_stack.apis.inference import Inference, UserMessage
+from llama_stack.apis.inference import Inference, SystemMessage, UserMessage
 from llama_stack.apis.scoring import Scoring
-from llama_stack.distribution.datatypes import Api
 from llama_stack.providers.datatypes import BenchmarksProtocolPrivate
 from llama_stack.providers.inline.agents.meta_reference.agent_instance import (
    MEMORY_QUERY_TOOL,
 )
-from llama_stack.providers.utils.common.data_schema_validator import (
-    ColumnName,
-    get_valid_schemas,
-    validate_dataset_schema,
-)
+from llama_stack.providers.utils.common.data_schema_validator import ColumnName
 from llama_stack.providers.utils.kvstore import kvstore_impl

-from .....apis.common.job_types import Job
-from .....apis.eval.eval import BenchmarkConfig, Eval, EvaluateResponse, JobStatus
+from .....apis.common.job_types import Job, JobStatus
+from .....apis.eval.eval import BenchmarkConfig, Eval, EvaluateResponse
 from .config import MetaReferenceEvalConfig

 EVAL_TASKS_PREFIX = "benchmarks:"
@ -88,15 +83,17 @@ class MetaReferenceEvalImpl(
        task_def = self.benchmarks[benchmark_id]
        dataset_id = task_def.dataset_id
        scoring_functions = task_def.scoring_functions
-        dataset_def = await self.datasets_api.get_dataset(dataset_id=dataset_id)
-        validate_dataset_schema(dataset_def.dataset_schema, get_valid_schemas(Api.eval.value))
-        all_rows = await self.datasetio_api.get_rows_paginated(
+
+        # TODO (xiyan): validate dataset schema
+        # dataset_def = await self.datasets_api.get_dataset(dataset_id=dataset_id)
+
+        all_rows = await self.datasetio_api.iterrows(
            dataset_id=dataset_id,
-            rows_in_page=(-1 if benchmark_config.num_examples is None else benchmark_config.num_examples),
+            limit=(-1 if benchmark_config.num_examples is None else benchmark_config.num_examples),
        )
        res = await self.evaluate_rows(
            benchmark_id=benchmark_id,
-            input_rows=all_rows.rows,
+            input_rows=all_rows.data,
            scoring_functions=scoring_functions,
            benchmark_config=benchmark_config,
        )
@ -105,7 +102,7 @@ class MetaReferenceEvalImpl(
        # need job scheduler queue (ray/celery) w/ jobs api
        job_id = str(len(self.jobs))
        self.jobs[job_id] = res
-        return Job(job_id=job_id)
+        return Job(job_id=job_id, status=JobStatus.completed)

    async def _run_agent_generation(
        self, input_rows: List[Dict[str, Any]], benchmark_config: BenchmarkConfig
@ -118,7 +115,7 @@ class MetaReferenceEvalImpl(
        for i, x in tqdm(enumerate(input_rows)):
            assert ColumnName.chat_completion_input.value in x, "Invalid input row"
            input_messages = json.loads(x[ColumnName.chat_completion_input.value])
-            input_messages = [UserMessage(**x) for x in input_messages]
+            input_messages = [UserMessage(**x) for x in input_messages if x["role"] == "user"]

            # NOTE: only single-turn agent generation is supported. Create a new session for each input row
            session_create_response = await self.agents_api.create_agent_session(agent_id, f"session-{i}")
@ -168,10 +165,11 @@ class MetaReferenceEvalImpl(
                generations.append({ColumnName.generated_answer.value: response.completion_message.content})
            elif ColumnName.chat_completion_input.value in x:
                chat_completion_input_json = json.loads(x[ColumnName.chat_completion_input.value])
-                input_messages = [UserMessage(**x) for x in chat_completion_input_json]
+                input_messages = [UserMessage(**x) for x in chat_completion_input_json if x["role"] == "user"]
                messages = []
                if candidate.system_message:
                    messages.append(candidate.system_message)
+                messages += [SystemMessage(**x) for x in chat_completion_input_json if x["role"] == "system"]
                messages += input_messages
                response = await self.inference_api.chat_completion(
                    model_id=candidate.model,
@ -218,17 +216,18 @@ class MetaReferenceEvalImpl(

        return EvaluateResponse(generations=generations, scores=score_response.results)

-    async def job_status(self, benchmark_id: str, job_id: str) -> Optional[JobStatus]:
+    async def job_status(self, benchmark_id: str, job_id: str) -> Job:
        if job_id in self.jobs:
-            return JobStatus.completed
+            return Job(job_id=job_id, status=JobStatus.completed)

-        return None
+        raise ValueError(f"Job {job_id} not found")

    async def job_cancel(self, benchmark_id: str, job_id: str) -> None:
        raise NotImplementedError("Job cancel is not implemented yet")

    async def job_result(self, benchmark_id: str, job_id: str) -> EvaluateResponse:
-        status = await self.job_status(benchmark_id, job_id)
+        job = await self.job_status(benchmark_id, job_id)
+        status = job.status
        if not status or status != JobStatus.completed:
            raise ValueError(f"Job is not completed, Status: {status.value}")

--- a/llama_stack/providers/inline/inference/meta_reference/parallel_utils.py
+++ b/llama_stack/providers/inline/inference/meta_reference/parallel_utils.py
@ -10,6 +10,7 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.

+import copy
 import json
 import logging
 import multiprocessing
@ -213,7 +214,7 @@ def maybe_parse_message(maybe_json: Optional[str]) -> Optional[ProcessingMessage

 def parse_message(json_str: str) -> ProcessingMessage:
    data = json.loads(json_str)
-    return ProcessingMessageWrapper(**data).payload
+    return copy.deepcopy(ProcessingMessageWrapper(**data).payload)


 def worker_process_entrypoint(
--- a/llama_stack/providers/inline/inference/sentence_transformers/sentence_transformers.py
+++ b/llama_stack/providers/inline/inference/sentence_transformers/sentence_transformers.py
@ -43,7 +43,7 @@ class SentenceTransformersInferenceImpl(
    async def shutdown(self) -> None:
        pass

-    async def register_model(self, model: Model) -> None:
+    async def register_model(self, model: Model) -> Model:
        return model

    async def unregister_model(self, model_id: str) -> None:
--- a/llama_stack/providers/inline/inference/vllm/config.py
+++ b/llama_stack/providers/inline/inference/vllm/config.py
@ -4,6 +4,8 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.

+from typing import Any, Dict
+
 from pydantic import BaseModel, Field

 from llama_stack.schema_utils import json_schema_type
@ -40,7 +42,7 @@ class VLLMConfig(BaseModel):
    )

    @classmethod
-    def sample_run_config(cls):
+    def sample_run_config(cls, **kwargs: Any) -> Dict[str, Any]:
        return {
            "tensor_parallel_size": "${env.TENSOR_PARALLEL_SIZE:1}",
            "max_tokens": "${env.MAX_TOKENS:4096}",
--- a/llama_stack/providers/inline/inference/vllm/vllm.py
+++ b/llama_stack/providers/inline/inference/vllm/vllm.py
@ -582,6 +582,7 @@ class VLLMInferenceImpl(Inference, ModelsProtocolPrivate):
                    tool_name=t.function.name,
                    # vLLM function args come back as a string. Llama Stack expects JSON.
                    arguments=json.loads(t.function.arguments),
+                    arguments_json=t.function.arguments,
                )
                for t in vllm_message.tool_calls
            ],
--- a/llama_stack/providers/inline/post_training/common/validator.py
+++ b/llama_stack/providers/inline/post_training/common/validator.py
@ -9,6 +9,9 @@
 #
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
+
+from typing import Any
+
 from llama_stack.apis.common.type_system import (
    ChatCompletionInputType,
    DialogType,
@ -20,7 +23,7 @@ from llama_stack.providers.utils.common.data_schema_validator import (
    validate_dataset_schema,
 )

-EXPECTED_DATASET_SCHEMA = {
+EXPECTED_DATASET_SCHEMA: dict[str, list[dict[str, Any]]] = {
    "instruct": [
        {
            ColumnName.chat_completion_input.value: ChatCompletionInputType(),
@ -41,6 +44,9 @@ async def validate_input_dataset_schema(
    dataset_type: str,
 ) -> None:
    dataset_def = await datasets_api.get_dataset(dataset_id=dataset_id)
+    if not dataset_def:
+        raise ValueError(f"Dataset {dataset_id} does not exist.")
+
    if not dataset_def.dataset_schema or len(dataset_def.dataset_schema) == 0:
        raise ValueError(f"Dataset {dataset_id} does not have a schema defined.")

--- a/llama_stack/providers/inline/post_training/torchtune/common/checkpointer.py
+++ b/llama_stack/providers/inline/post_training/torchtune/common/checkpointer.py
@ -37,7 +37,7 @@ class TorchtuneCheckpointer:
        checkpoint_files: List[str],
        output_dir: str,
        model_type: str,
-    ) -> None:
+    ):
        # Fail fast if ``checkpoint_files`` is invalid
        # TODO: support loading more than one file
        if len(checkpoint_files) != 1:
@ -58,7 +58,7 @@ class TorchtuneCheckpointer:
        """
        Load Meta checkpoint from file. Currently only loading from a single file is supported.
        """
-        state_dict: Dict[str:Any] = {}
+        state_dict: Dict[str, Any] = {}
        model_state_dict = safe_torch_load(self._checkpoint_path)
        if self._model_type == ModelType.LLAMA3_VISION:
            from torchtune.models.llama3_2_vision._convert_weights import (
@ -85,10 +85,10 @@ class TorchtuneCheckpointer:
        state_dict: Dict[str, Any],
        epoch: int,
        adapter_only: bool = False,
-        checkpoint_format: str = "meta",
+        checkpoint_format: str | None = None,
    ) -> str:
        model_file_path = Path(self._output_dir) / f"{self._model_id}-{self._training_algorithm}-{epoch}"
-        if checkpoint_format == "meta":
+        if checkpoint_format == "meta" or checkpoint_format is None:
            self._save_meta_format_checkpoint(model_file_path, state_dict, adapter_only)
        elif checkpoint_format == "huggingface":
            # Note: for saving hugging face format checkpoints, we only suppport saving adapter weights now
--- a/llama_stack/providers/inline/post_training/torchtune/common/utils.py
+++ b/llama_stack/providers/inline/post_training/torchtune/common/utils.py
@ -10,7 +10,7 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.

-from typing import Any, Callable, Dict
+from typing import Callable, Dict

 import torch
 from pydantic import BaseModel
@ -25,10 +25,13 @@ from llama_stack.apis.post_training import DatasetFormat
 from llama_stack.models.llama.datatypes import Model
 from llama_stack.models.llama.sku_list import resolve_model

+BuildLoraModelCallable = Callable[..., torch.nn.Module]
+BuildTokenizerCallable = Callable[..., Llama3Tokenizer]
+

 class ModelConfig(BaseModel):
-    model_definition: Any
-    tokenizer_type: Any
+    model_definition: BuildLoraModelCallable
+    tokenizer_type: BuildTokenizerCallable
    checkpoint_type: str


@ -51,10 +54,6 @@ DATA_FORMATS: Dict[str, Transform] = {
 }


-BuildLoraModelCallable = Callable[..., torch.nn.Module]
-BuildTokenizerCallable = Callable[..., Llama3Tokenizer]
-
-
 def _validate_model_id(model_id: str) -> Model:
    model = resolve_model(model_id)
    if model is None or model.core_model_id.value not in MODEL_CONFIGS:
--- a/llama_stack/providers/inline/post_training/torchtune/config.py
+++ b/llama_stack/providers/inline/post_training/torchtune/config.py
@ -4,7 +4,7 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.

-from typing import Literal, Optional
+from typing import Any, Dict, Literal, Optional

 from pydantic import BaseModel

@ -12,3 +12,9 @@ from pydantic import BaseModel
 class TorchtunePostTrainingConfig(BaseModel):
    torch_seed: Optional[int] = None
    checkpoint_format: Optional[Literal["meta", "huggingface"]] = "meta"
+
+    @classmethod
+    def sample_run_config(cls, __distro_dir__: str, **kwargs: Any) -> Dict[str, Any]:
+        return {
+            "checkpoint_format": "meta",
+        }
--- a/llama_stack/providers/inline/post_training/torchtune/datasets/sft.py
+++ b/llama_stack/providers/inline/post_training/torchtune/datasets/sft.py
@ -55,7 +55,7 @@ class SFTDataset(Dataset):
        if "messages" in transformed_sample:
            validate_messages(transformed_sample["messages"])

-        tokenized_dict = self._model_transform(transformed_sample)
+        tokenized_dict: dict[str, Any] = self._model_transform(transformed_sample)

        if not ("tokens" in tokenized_dict and "mask" in tokenized_dict):
            keys_str = ", ".join(tokenized_dict.keys())
--- a/llama_stack/providers/inline/post_training/torchtune/post_training.py
+++ b/llama_stack/providers/inline/post_training/torchtune/post_training.py
@ -3,7 +3,7 @@
 #
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
-from datetime import datetime
+from datetime import datetime, timezone
 from typing import Any, Dict, Optional

 from llama_stack.apis.datasetio import DatasetIO
@ -64,7 +64,7 @@ class TorchtunePostTrainingImpl:
        job_status_response = PostTrainingJobStatusResponse(
            job_uuid=job_uuid,
            status=JobStatus.scheduled,
-            scheduled_at=datetime.now(),
+            scheduled_at=datetime.now(timezone.utc),
        )
        self.jobs[job_uuid] = job_status_response

@ -84,7 +84,7 @@ class TorchtunePostTrainingImpl:
                )

                job_status_response.status = JobStatus.in_progress
-                job_status_response.started_at = datetime.now()
+                job_status_response.started_at = datetime.now(timezone.utc)

                await recipe.setup()
                resources_allocated, checkpoints = await recipe.train()
@ -93,7 +93,7 @@ class TorchtunePostTrainingImpl:
                job_status_response.resources_allocated = resources_allocated
                job_status_response.checkpoints = checkpoints
                job_status_response.status = JobStatus.completed
-                job_status_response.completed_at = datetime.now()
+                job_status_response.completed_at = datetime.now(timezone.utc)

            except Exception:
                job_status_response.status = JobStatus.failed
--- a/llama_stack/providers/inline/post_training/torchtune/recipes/lora_finetuning_single_device.py
+++ b/llama_stack/providers/inline/post_training/torchtune/recipes/lora_finetuning_single_device.py
@ -8,7 +8,7 @@ import gc
 import logging
 import os
 import time
-from datetime import datetime
+from datetime import datetime, timezone
 from functools import partial
 from pathlib import Path
 from typing import Any, Dict, List, Optional, Tuple
@ -37,10 +37,10 @@ from llama_stack.apis.common.training_types import PostTrainingMetric
 from llama_stack.apis.datasetio import DatasetIO
 from llama_stack.apis.datasets import Datasets
 from llama_stack.apis.post_training import (
-    AlgorithmConfig,
    Checkpoint,
    LoraFinetuningConfig,
    OptimizerConfig,
+    QATFinetuningConfig,
    TrainingConfig,
 )
 from llama_stack.distribution.utils.config_dirs import DEFAULT_CHECKPOINT_DIR
@ -73,6 +73,9 @@ class LoraFinetuningSingleDevice:

    # Currently logging only logs limited training metrics to local disk
    # will figure out more loggings and how it works with telemetry in future PRs
+
+    _checkpointer: TorchtuneCheckpointer
+
    def __init__(
        self,
        config: TorchtunePostTrainingConfig,
@ -82,7 +85,7 @@ class LoraFinetuningSingleDevice:
        logger_config: Dict[str, Any],
        model: str,
        checkpoint_dir: Optional[str],
-        algorithm_config: Optional[AlgorithmConfig],
+        algorithm_config: LoraFinetuningConfig | QATFinetuningConfig | None,
        datasetio_api: DatasetIO,
        datasets_api: Datasets,
    ) -> None:
@ -109,12 +112,12 @@ class LoraFinetuningSingleDevice:
            return str(checkpoint_dir)

        if checkpoint_dir and checkpoint_dir != "null":
-            self.checkpoint_dir = config.checkpoint_dir
+            self.checkpoint_dir = checkpoint_dir
        else:
-            model = resolve_model(self.model_id)
-            if model is None:
+            model_obj = resolve_model(self.model_id)
+            if model_obj is None:
                raise ValueError(f"{self.model_id} not found. Your model id should be in the llama models SKU list")
-            self.checkpoint_dir = model_checkpoint_dir(model)
+            self.checkpoint_dir = model_checkpoint_dir(model_obj)

        self._output_dir = str(DEFAULT_CHECKPOINT_DIR)
        self._checkpoint_format = config.checkpoint_format
@ -135,16 +138,16 @@ class LoraFinetuningSingleDevice:
        self.max_validation_steps = training_config.max_validation_steps

        self._clip_grad_norm = 1.0
-        self._enable_activation_checkpointing = (
-            (training_config.efficiency_config.enable_activation_checkpointing)
-            if training_config.efficiency_config
-            else False
-        )
-        self._enable_activation_offloading = (
-            (training_config.efficiency_config.enable_activation_offloading)
-            if training_config.efficiency_config
-            else False
-        )
+
+        self._enable_activation_checkpointing = False
+        self._enable_activation_offloading = False
+        if training_config.efficiency_config:
+            if training_config.efficiency_config.enable_activation_checkpointing:
+                self._enable_activation_checkpointing = (
+                    training_config.efficiency_config.enable_activation_checkpointing
+                )
+            if training_config.efficiency_config.enable_activation_offloading:
+                self._enable_activation_offloading = training_config.efficiency_config.enable_activation_offloading

        self.datasetio_api = datasetio_api
        self.datasets_api = datasets_api
@ -328,13 +331,13 @@ class LoraFinetuningSingleDevice:
        batch_size: int,
    ) -> Tuple[DistributedSampler, DataLoader]:
        async def fetch_rows(dataset_id: str):
-            return await self.datasetio_api.get_rows_paginated(
+            return await self.datasetio_api.iterrows(
                dataset_id=dataset_id,
-                rows_in_page=-1,
+                limit=-1,
            )

        all_rows = await fetch_rows(dataset_id)
-        rows = all_rows.rows
+        rows = all_rows.data

        await validate_input_dataset_schema(
            datasets_api=self.datasets_api,
@ -451,12 +454,12 @@ class LoraFinetuningSingleDevice:
        """
        # Initialize tokens count and running loss (for grad accumulation)
        t0 = time.perf_counter()
-        running_loss = 0
+        running_loss: float = 0.0
        num_tokens = 0

        # training artifacts
        checkpoints = []
-        memory_stats = {}
+        memory_stats: Dict[str, Any] = {}

        # self.epochs_run should be non-zero when we're resuming from a checkpoint
        for curr_epoch in range(self.epochs_run, self.total_epochs):
@ -484,7 +487,7 @@ class LoraFinetuningSingleDevice:
                # Loss is normalized by default so we multiply by the number of tokens
                # This way we can normalize by the total number of tokens if we're accumulating gradients
                current_loss = await self._loss_step(batch) * current_num_tokens
-                running_loss += current_loss
+                running_loss += current_loss.detach().item()
                current_loss.backward()

                # Step with optimizer
@ -500,7 +503,7 @@ class LoraFinetuningSingleDevice:
                    # Update the number of steps when the weights are updated
                    self.global_step += 1

-                    loss_to_log = running_loss.item() / num_tokens
+                    loss_to_log = running_loss / num_tokens

                    pbar.update(1)
                    pbar.set_description(f"{curr_epoch + 1}|{self.global_step}|Loss: {loss_to_log}")
@ -523,7 +526,7 @@ class LoraFinetuningSingleDevice:
                    )

                    # Reset running stats for the next step
-                    running_loss = 0
+                    running_loss = 0.0
                    num_tokens = 0
                    t0 = time.perf_counter()

@ -532,7 +535,7 @@ class LoraFinetuningSingleDevice:
            checkpoint_path = await self.save_checkpoint(epoch=curr_epoch)
            checkpoint = Checkpoint(
                identifier=f"{self.model_id}-sft-{curr_epoch}",
-                created_at=datetime.now(),
+                created_at=datetime.now(timezone.utc),
                epoch=curr_epoch,
                post_training_job_id=self.job_uuid,
                path=checkpoint_path,
--- a/llama_stack/providers/inline/preprocessing/basic/basic.py
+++ b/llama_stack/providers/inline/preprocessing/basic/basic.py
@ -5,7 +5,7 @@
 # the root directory of this source tree.
 import logging
 import re
-from typing import List, Optional
+from typing import Any, List, Optional

 import httpx

@ -39,18 +39,24 @@ class InclineBasicPreprocessorImpl(Preprocessing, PreprocessorsProtocolPrivate):
    # this preprocessor optionally retrieves the documents and converts them into plain text
    output_types = [PreprocessingDataType.raw_text_document]

+    preprocessor_store = None
+
    URL_VALIDATION_PATTERN = re.compile("^(https?://|file://|data:)")

    def __init__(self, config: InlineBasicPreprocessorConfig) -> None:
        self.config = config

-    async def initialize(self) -> None: ...
+    async def initialize(self) -> None:
+        pass

-    async def shutdown(self) -> None: ...
+    async def shutdown(self) -> None:
+        pass

-    async def register_preprocessor(self, preprocessor: Preprocessor) -> None: ...
+    async def register_preprocessor(self, preprocessor: Preprocessor) -> None:
+        pass

-    async def unregister_preprocessor(self, preprocessor_id: str) -> None: ...
+    async def unregister_preprocessor(self, preprocessor_id: str) -> None:
+        pass

    async def do_preprocess(
        self,
@ -78,7 +84,7 @@ class InclineBasicPreprocessorImpl(Preprocessing, PreprocessorsProtocolPrivate):
                    )
                    continue
            elif input_type == PreprocessingDataType.raw_text_document:
-                document = interleaved_content_as_str(inp.data_element_path_or_content)
+                document = interleaved_content_as_str(inp.data_element_path_or_content)  # type: ignore
            else:
                log.error(f"Unexpected preprocessor input type: {input_type}")
                continue
@ -112,7 +118,9 @@ class InclineBasicPreprocessorImpl(Preprocessing, PreprocessorsProtocolPrivate):

        if isinstance(preprocessor_input.data_element_path_or_content, URL):
            return PreprocessingDataType.document_uri
-        if InclineBasicPreprocessorImpl.URL_VALIDATION_PATTERN.match(preprocessor_input.data_element_path_or_content):
+        if InclineBasicPreprocessorImpl.URL_VALIDATION_PATTERN.match(
+            str(preprocessor_input.data_element_path_or_content)
+        ):
            return PreprocessingDataType.document_uri
        if preprocessor_input.data_element_format == PreprocessingDataFormat.pdf:
            return PreprocessingDataType.binary_document
@ -120,7 +128,7 @@ class InclineBasicPreprocessorImpl(Preprocessing, PreprocessorsProtocolPrivate):
        return PreprocessingDataType.raw_text_document

    @staticmethod
-    async def _fetch_document(preprocessor_input: PreprocessingDataElement) -> str | None:
+    async def _fetch_document(preprocessor_input: PreprocessingDataElement) -> Any:
        if isinstance(preprocessor_input.data_element_path_or_content, str):
            url = preprocessor_input.data_element_path_or_content
            if not InclineBasicPreprocessorImpl.URL_VALIDATION_PATTERN.match(url):
--- a/llama_stack/providers/inline/preprocessing/simple_chunking/simple_chunking.py
+++ b/llama_stack/providers/inline/preprocessing/simple_chunking/simple_chunking.py
@ -36,6 +36,8 @@ class InclineSimpleChunkingImpl(Preprocessing, PreprocessorsProtocolPrivate):
    input_types = [PreprocessingDataType.raw_text_document]
    output_types = [PreprocessingDataType.chunks]

+    preprocessor_store = None
+
    def __init__(self, config: InclineSimpleChunkingConfig) -> None:
        self.config = config

@ -59,7 +61,7 @@ class InclineSimpleChunkingImpl(Preprocessing, PreprocessorsProtocolPrivate):

        for inp in preprocessor_inputs:
            new_chunks = self.make_overlapped_chunks(
-                inp.data_element_id, inp.data_element_path_or_content, window_len, overlap_len
+                inp.data_element_id, str(inp.data_element_path_or_content), window_len, overlap_len
            )
            for i, chunk in enumerate(new_chunks):
                new_chunk_data_element = PreprocessingDataElement(
@ -79,7 +81,7 @@ class InclineSimpleChunkingImpl(Preprocessing, PreprocessorsProtocolPrivate):
    ) -> PreprocessorResponse:
        return await self.do_preprocess(preprocessor_id="", preprocessor_inputs=preprocessor_inputs)

-    def _resolve_chunk_size_params(self, options: PreprocessorOptions) -> Tuple[int, int]:
+    def _resolve_chunk_size_params(self, options: PreprocessorOptions | None) -> Tuple[int, int]:
        window_len = (options or {}).get(
            str(SimpleChunkingOptions.chunk_size_in_tokens), self.config.chunk_size_in_tokens
        )
--- a/llama_stack/providers/inline/safety/code_scanner/config.py
+++ b/llama_stack/providers/inline/safety/code_scanner/config.py
@ -4,8 +4,12 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.

+from typing import Any, Dict
+
 from pydantic import BaseModel


 class CodeScannerConfig(BaseModel):
-    pass
+    @classmethod
+    def sample_run_config(cls, __distro_dir__: str, **kwargs: Any) -> Dict[str, Any]:
+        return {}
--- a/llama_stack/providers/inline/safety/llama_guard/config.py
+++ b/llama_stack/providers/inline/safety/llama_guard/config.py
@ -4,10 +4,16 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.

-from typing import List
+from typing import Any, Dict, List

 from pydantic import BaseModel


 class LlamaGuardConfig(BaseModel):
    excluded_categories: List[str] = []
+
+    @classmethod
+    def sample_run_config(cls, __distro_dir__: str, **kwargs: Any) -> Dict[str, Any]:
+        return {
+            "excluded_categories": [],
+        }
--- a/llama_stack/providers/inline/safety/llama_guard/llama_guard.py
+++ b/llama_stack/providers/inline/safety/llama_guard/llama_guard.py
@ -227,13 +227,6 @@ class LlamaGuardShield:
        if len(messages) >= 2 and (messages[0].role == Role.user.value and messages[1].role == Role.user.value):
            messages = messages[1:]

-        for i in range(1, len(messages)):
-            if messages[i].role == messages[i - 1].role:
-                for i, m in enumerate(messages):
-                    print(f"{i}: {m.role}: {m.content}")
-                raise ValueError(
-                    f"Messages must alternate between user and assistant. Message {i} has the same role as message {i - 1}"
-                )
        return messages

    async def run(self, messages: List[Message]) -> RunShieldResponse:
--- a/llama_stack/providers/inline/safety/prompt_guard/config.py
+++ b/llama_stack/providers/inline/safety/prompt_guard/config.py
@ -5,6 +5,7 @@
 # the root directory of this source tree.

 from enum import Enum
+from typing import Any, Dict

 from pydantic import BaseModel, field_validator

@ -23,3 +24,9 @@ class PromptGuardConfig(BaseModel):
        if v not in [t.value for t in PromptGuardType]:
            raise ValueError(f"Unknown prompt guard type: {v}")
        return v
+
+    @classmethod
+    def sample_run_config(cls, __distro_dir__: str, **kwargs: Any) -> Dict[str, Any]:
+        return {
+            "guard_type": "injection",
+        }
--- a/llama_stack/providers/inline/scoring/basic/config.py
+++ b/llama_stack/providers/inline/scoring/basic/config.py
@ -3,7 +3,12 @@
 #
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
+from typing import Any, Dict
+
 from pydantic import BaseModel


-class BasicScoringConfig(BaseModel): ...
+class BasicScoringConfig(BaseModel):
+    @classmethod
+    def sample_run_config(cls, __distro_dir__: str, **kwargs: Any) -> Dict[str, Any]:
+        return {}
--- a/llama_stack/providers/inline/scoring/basic/scoring.py
+++ b/llama_stack/providers/inline/scoring/basic/scoring.py
@ -22,12 +22,25 @@ from llama_stack.providers.utils.common.data_schema_validator import (
 )

 from .config import BasicScoringConfig
+from .scoring_fn.bfcl_scoring_fn import BFCLScoringFn
+from .scoring_fn.docvqa_scoring_fn import DocVQAScoringFn
 from .scoring_fn.equality_scoring_fn import EqualityScoringFn
-from .scoring_fn.regex_parser_math_response_scoring_fn import RegexParserMathResponseScoringFn
+from .scoring_fn.ifeval_scoring_fn import IfEvalScoringFn
+from .scoring_fn.regex_parser_math_response_scoring_fn import (
+    RegexParserMathResponseScoringFn,
+)
 from .scoring_fn.regex_parser_scoring_fn import RegexParserScoringFn
 from .scoring_fn.subset_of_scoring_fn import SubsetOfScoringFn

-FIXED_FNS = [EqualityScoringFn, SubsetOfScoringFn, RegexParserScoringFn, RegexParserMathResponseScoringFn]
+FIXED_FNS = [
+    EqualityScoringFn,
+    SubsetOfScoringFn,
+    RegexParserScoringFn,
+    RegexParserMathResponseScoringFn,
+    BFCLScoringFn,
+    IfEvalScoringFn,
+    DocVQAScoringFn,
+]


 class BasicScoringImpl(
@ -75,12 +88,12 @@ class BasicScoringImpl(
        dataset_def = await self.datasets_api.get_dataset(dataset_id=dataset_id)
        validate_dataset_schema(dataset_def.dataset_schema, get_valid_schemas(Api.scoring.value))

-        all_rows = await self.datasetio_api.get_rows_paginated(
+        all_rows = await self.datasetio_api.iterrows(
            dataset_id=dataset_id,
-            rows_in_page=-1,
+            limit=-1,
        )
        res = await self.score(
-            input_rows=all_rows.rows,
+            input_rows=all_rows.data,
            scoring_functions=scoring_functions,
        )
        if save_results_dataset:
--- a/llama_stack/providers/inline/scoring/basic/scoring_fn/bfcl_scoring_fn.py
+++ b/llama_stack/providers/inline/scoring/basic/scoring_fn/bfcl_scoring_fn.py
@ -0,0 +1,93 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+import json
+import re
+from typing import Any, Dict, Optional
+
+from llama_stack.apis.scoring import ScoringResultRow
+from llama_stack.apis.scoring_functions import ScoringFnParams
+from llama_stack.providers.utils.scoring.base_scoring_fn import RegisteredBaseScoringFn
+
+from ..utils.bfcl.ast_parser import decode_ast
+from ..utils.bfcl.checker import ast_checker, is_empty_output
+from .fn_defs.bfcl import bfcl
+
+
+def postprocess(x: Dict[str, Any], test_category: str) -> Dict[str, Any]:
+    """
+    Decode a generated answer into function calls and check it against the ground truth.
+
+    Reads ``generated_answer``, ``language``, ``function`` and ``ground_truth`` from ``x``.
+    ``function`` and ``ground_truth`` are parsed with ``json.loads`` before being passed
+    to ``ast_checker``.
+
+    :param x: the input row being scored
+    :param test_category: forwarded unchanged to ``ast_checker``
+    :returns: a dict with the keys ``prediction``, ``contain_func_call``, ``valid``,
+        ``error`` and ``error_type``; ``valid`` defaults to ``False`` and the error
+        fields to ``""`` when neither this function nor the checker set them
+    """
+    contain_func_call = False
+    error = None
+    error_type = None
+    # Stays empty unless ast_checker runs; the .get() defaults below rely on that.
+    checker_result = {}
+    try:
+        # A falsy decode result is replaced by "" so `prediction` is never None.
+        prediction = decode_ast(x["generated_answer"], x["language"]) or ""
+        # Decoding succeeded; assume a function call until is_empty_output says otherwise.
+        contain_func_call = True
+        # if not is_function_calling_format_output(prediction):
+        if is_empty_output(prediction):
+            contain_func_call = False
+            error = "Did not output in the specified format. Note: the model_result is wrapped in a string to ensure json serializability."
+            error_type = "ast_decoder:decoder_wrong_output_format"
+        else:
+            checker_result = ast_checker(
+                json.loads(x["function"]),
+                prediction,
+                json.loads(x["ground_truth"]),
+                x["language"],
+                test_category=test_category,
+                model_name="",
+            )
+    except Exception as e:
+        # NOTE(review): this handler also catches failures raised by json.loads and
+        # ast_checker above, not only by decode_ast. In that case contain_func_call
+        # is left at True while the error is reported as a decoder failure.
+        prediction = ""
+        error = f"Invalid syntax. Failed to decode AST. {str(e)}"
+        error_type = "ast_decoder:decoder_failed"
+    return {
+        "prediction": prediction,
+        "contain_func_call": contain_func_call,
+        "valid": checker_result.get("valid", False),
+        # Errors detected in this function take precedence over the checker's own.
+        "error": error or checker_result.get("error", ""),
+        "error_type": error_type or checker_result.get("error_type", ""),
+    }
+
+
+def gen_valid(x: Dict[str, Any]) -> Dict[str, float]:
+    """
+    Turn the ``valid`` entry of a postprocessed row into a numeric score.
+
+    :param x: a dict carrying a ``valid`` entry
+    :returns: ``{"valid": <float>}``
+    """
+    # Cast explicitly so the value matches the declared Dict[str, float] return
+    # type, the same way gen_relevance_acc does; a bool would otherwise leak out.
+    return {"valid": float(x["valid"])}
+
+
+def gen_relevance_acc(x: Dict[str, Any]) -> Dict[str, float]:
+    """
+    Score a row from a relevance or irrelevance test.
+
+    Both kinds of test are handled here because they use exactly opposite logic:
+
+    - If ``x["id"]`` contains "irrelevance", the model is expected to output no function
+      call. No function call means either that AST decoding failed (an error message is
+      generated) or that the decoded AST contains no function call (such as an empty
+      list, ``[]``).
+    - Otherwise (a relevance test), the model is expected to output a function call, and
+      an empty list does not count as one.
+
+    :param x: a dict carrying ``id`` and ``contain_func_call``
+    :returns: ``{"valid": 1.0}`` when the expectation is met, ``{"valid": 0.0}`` otherwise
+    """
+    if "irrelevance" in x["id"]:
+        is_correct = not x["contain_func_call"]
+    else:
+        is_correct = x["contain_func_call"]
+    return {"valid": float(is_correct)}
+
+
+class BFCLScoringFn(RegisteredBaseScoringFn):
+    """
+    A scoring_fn for BFCL
+    """
+
+    def __init__(self, *args, **kwargs) -> None:
+        super().__init__(*args, **kwargs)
+        # The bfcl definition is the only one this scorer registers.
+        self.supported_fn_defs_registry = {
+            bfcl.identifier: bfcl,
+        }
+
+    async def score_row(
+        self,
+        input_row: Dict[str, Any],
+        scoring_fn_identifier: Optional[str] = "bfcl",
+        scoring_params: Optional[ScoringFnParams] = None,
+    ) -> ScoringResultRow:
+        """
+        Score a single BFCL row.
+
+        :param input_row: the row to score; its ``id`` determines the test category
+        :param scoring_fn_identifier: not used by this implementation
+        :param scoring_params: not used by this implementation
+        :returns: ``{"score": <float>}``
+        :raises KeyError: if ``input_row`` has no ``id`` entry
+        """
+        row_id = input_row["id"]
+        # Drop the trailing "_<digits/underscores/hyphens>" suffix to get the category,
+        # e.g. "live_irrelevance_3-1-0" -> "live_irrelevance".
+        test_category = re.sub(r"_[0-9_-]+$", "", row_id)
+        score_result = postprocess(input_row, test_category)
+        if test_category in {"irrelevance", "live_relevance", "live_irrelevance"}:
+            # gen_relevance_acc looks up "id", which postprocess() does not copy into
+            # its result; pass it along explicitly, otherwise this branch raises KeyError.
+            score = gen_relevance_acc({**score_result, "id": row_id})["valid"]
+        else:
+            score = gen_valid(score_result)["valid"]
+        return {
+            "score": float(score),
+        }
--- a/llama_stack/providers/inline/scoring/basic/scoring_fn/docvqa_scoring_fn.py
+++ b/llama_stack/providers/inline/scoring/basic/scoring_fn/docvqa_scoring_fn.py
@ -0,0 +1,240 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+import json
+import re
+from typing import Any, Dict, Optional
+
+from llama_stack.apis.scoring import ScoringResultRow
+from llama_stack.apis.scoring_functions import ScoringFnParams
+from llama_stack.providers.utils.scoring.base_scoring_fn import RegisteredBaseScoringFn
+
+from .fn_defs.docvqa import docvqa
+
+# Word-level replacement table used by normalize_answer as its last step: each key that appears as a
+# whole (already lower-cased) word in the answer is replaced by the mapped spelling.
+# Most entries map an apostrophe-less or misplaced-apostrophe spelling to the canonical contraction;
+# the last three map the ordinals "1st"/"2nd"/"3rd" to words.
+# NOTE(review): normalize_answer lower-cases the text before the lookup, so the keys that contain an
+# upper-case letter ("Id've", "I'dve", "Im", "Ive") can never match.
+# NOTE(review): "somebody'd" -> "somebodyd" goes in the opposite direction from its neighbours; this
+# looks inverted — confirm against the reference normalization before changing, since it affects scores.
+CONTRACTIONS = {
+    "aint": "ain't",
+    "arent": "aren't",
+    "cant": "can't",
+    "couldve": "could've",
+    "couldnt": "couldn't",
+    "couldn'tve": "couldn't've",
+    "couldnt've": "couldn't've",
+    "didnt": "didn't",
+    "doesnt": "doesn't",
+    "dont": "don't",
+    "hadnt": "hadn't",
+    "hadnt've": "hadn't've",
+    "hadn'tve": "hadn't've",
+    "hasnt": "hasn't",
+    "havent": "haven't",
+    "hed": "he'd",
+    "hed've": "he'd've",
+    "he'dve": "he'd've",
+    "hes": "he's",
+    "howd": "how'd",
+    "howll": "how'll",
+    "hows": "how's",
+    "Id've": "I'd've",
+    "I'dve": "I'd've",
+    "Im": "I'm",
+    "Ive": "I've",
+    "isnt": "isn't",
+    "itd": "it'd",
+    "itd've": "it'd've",
+    "it'dve": "it'd've",
+    "itll": "it'll",
+    "let's": "let's",
+    "maam": "ma'am",
+    "mightnt": "mightn't",
+    "mightnt've": "mightn't've",
+    "mightn'tve": "mightn't've",
+    "mightve": "might've",
+    "mustnt": "mustn't",
+    "mustve": "must've",
+    "neednt": "needn't",
+    "notve": "not've",
+    "oclock": "o'clock",
+    "oughtnt": "oughtn't",
+    "ow's'at": "'ow's'at",
+    "'ows'at": "'ow's'at",
+    "'ow'sat": "'ow's'at",
+    "shant": "shan't",
+    "shed've": "she'd've",
+    "she'dve": "she'd've",
+    "she's": "she's",
+    "shouldve": "should've",
+    "shouldnt": "shouldn't",
+    "shouldnt've": "shouldn't've",
+    "shouldn'tve": "shouldn't've",
+    "somebody'd": "somebodyd",
+    "somebodyd've": "somebody'd've",
+    "somebody'dve": "somebody'd've",
+    "somebodyll": "somebody'll",
+    "somebodys": "somebody's",
+    "someoned": "someone'd",
+    "someoned've": "someone'd've",
+    "someone'dve": "someone'd've",
+    "someonell": "someone'll",
+    "someones": "someone's",
+    "somethingd": "something'd",
+    "somethingd've": "something'd've",
+    "something'dve": "something'd've",
+    "somethingll": "something'll",
+    "thats": "that's",
+    "thered": "there'd",
+    "thered've": "there'd've",
+    "there'dve": "there'd've",
+    "therere": "there're",
+    "theres": "there's",
+    "theyd": "they'd",
+    "theyd've": "they'd've",
+    "they'dve": "they'd've",
+    "theyll": "they'll",
+    "theyre": "they're",
+    "theyve": "they've",
+    "twas": "'twas",
+    "wasnt": "wasn't",
+    "wed've": "we'd've",
+    "we'dve": "we'd've",
+    "weve": "we've",
+    "werent": "weren't",
+    "whatll": "what'll",
+    "whatre": "what're",
+    "whats": "what's",
+    "whatve": "what've",
+    "whens": "when's",
+    "whered": "where'd",
+    "wheres": "where's",
+    "whereve": "where've",
+    "whod": "who'd",
+    "whod've": "who'd've",
+    "who'dve": "who'd've",
+    "wholl": "who'll",
+    "whos": "who's",
+    "whove": "who've",
+    "whyll": "why'll",
+    "whyre": "why're",
+    "whys": "why's",
+    "wont": "won't",
+    "wouldve": "would've",
+    "wouldnt": "wouldn't",
+    "wouldnt've": "wouldn't've",
+    "wouldn'tve": "wouldn't've",
+    "yall": "y'all",
+    "yall'll": "y'all'll",
+    "y'allll": "y'all'll",
+    "yall'd've": "y'all'd've",
+    "y'alld've": "y'all'd've",
+    "y'all'dve": "y'all'd've",
+    "youd": "you'd",
+    "youd've": "you'd've",
+    "you'dve": "you'd've",
+    "youll": "you'll",
+    "youre": "you're",
+    "youve": "you've",
+    "1st": "first",
+    "2nd": "second",
+    "3rd": "third",
+}
+# Number words (plus "none") and the digit string normalize_answer substitutes for each of them.
+NUMBERS = {
+    "none": "0",
+    "zero": "0",
+    "one": "1",
+    "two": "2",
+    "three": "3",
+    "four": "4",
+    "five": "5",
+    "six": "6",
+    "seven": "7",
+    "eight": "8",
+    "nine": "9",
+    "ten": "10",
+}
+# Words that normalize_answer drops from the answer altogether (checked after the NUMBERS substitution).
+ARTICLES = [
+    "a",
+    "an",
+    "the",
+    "to",
+    "in",
+    "from",
+    "by",
+]  # Contains a bit more than just articles (some prepositions too): we do not want any of these words to influence the accuracy
+# Matches a "." that is not immediately followed by a digit.
+# NOTE(review): the leading group `(?!<=\d)` is a negative *lookahead* for the literal text "<=" followed
+# by a digit, not a negative lookbehind (which is spelled `(?<!\d)`), so it never prevents a match at a ".".
+# Presumably a lookbehind was intended — confirm before changing, because the fix would alter which periods
+# are stripped (e.g. the one in "3.") and therefore the scores.
+PERIOD_STRIP = re.compile(r"(?!<=\d)(\.)(?!\d)")
+# Matches a comma that sits directly between two digits, e.g. the one in "1,000".
+COMMA_STRIP = re.compile(r"(\d)(\,)(\d)")
+# Punctuation characters handled by normalize_answer, in this order: each one is either deleted or
+# replaced by a single space, depending on the surrounding text.
+PUNCTUATION = [
+    ";",
+    r"/",
+    "[",
+    "]",
+    '"',
+    "{",
+    "}",
+    "(",
+    ")",
+    "=",
+    "+",
+    "\\",
+    "_",
+    "-",
+    ">",
+    "<",
+    "@",
+    "`",
+    ",",
+    "?",
+    "!",
+]
+
+
+def normalize_answer(s: str) -> str:
+    """Normalize an answer string so that trivially different spellings compare equal.
+
+    The steps, in order: delete or space-out each character in PUNCTUATION, strip periods matched by
+    PERIOD_STRIP, lower-case and split on whitespace, map number words through NUMBERS, drop the words
+    listed in ARTICLES, and canonicalize contractions through CONTRACTIONS.
+
+    :param s: raw answer text.
+    :return: the normalized words joined by single spaces.
+    """
+    # process punctuation
+    for p in PUNCTUATION:
+        # Delete the character when it touches a space somewhere in the string (or when the string
+        # contains a digit-grouping comma); otherwise turn it into a word separator.
+        if (p + " " in s or " " + p in s) or (COMMA_STRIP.search(s) is not None):
+            s = s.replace(p, "")
+        else:
+            s = s.replace(p, " ")
+        # No extra argument here: the third positional parameter of Pattern.sub is `count`, not `flags`,
+        # so passing re.UNICODE would cap each pass at 32 replacements.
+        # The strip stays inside the loop because removing periods can change the space-adjacency test
+        # for the punctuation characters that follow.
+        s = PERIOD_STRIP.sub("", s)
+
+    # process digits and articles
+    out_text = []
+    for word in s.lower().split():
+        # Read-only lookup: setdefault() would insert every unseen word into the module-level table,
+        # making it grow without bound over a scoring run.
+        word = NUMBERS.get(word, word)
+        if word not in ARTICLES:
+            out_text.append(word)
+
+    # standardize contractions
+    return " ".join(CONTRACTIONS.get(word, word) for word in out_text)
+
+
+class DocVQAScoringFn(RegisteredBaseScoringFn):
+    """
+    DocVQA scoring: the generated answer is correct when, after normalization,
+    it equals any one of the allowed answers (normalized the same way), so that
+    trivial differences are not penalized.
+    """
+
+    def __init__(self, *args, **kwargs) -> None:
+        super().__init__(*args, **kwargs)
+        self.supported_fn_defs_registry = {docvqa.identifier: docvqa}
+
+    async def score_row(
+        self,
+        input_row: Dict[str, Any],
+        scoring_fn_identifier: Optional[str] = "docvqa",
+        scoring_params: Optional[ScoringFnParams] = None,
+    ) -> ScoringResultRow:
+        # `expected_answer` holds the allowed answers as a JSON-encoded collection.
+        allowed_answers = json.loads(input_row["expected_answer"])
+        prediction = normalize_answer(input_row["generated_answer"])
+        accepted = [normalize_answer(candidate) for candidate in allowed_answers]
+        if prediction in accepted:
+            return {"score": 1.0}
+        return {"score": 0.0}
--- a/llama_stack/providers/inline/scoring/basic/scoring_fn/fn_defs/bfcl.py
+++ b/llama_stack/providers/inline/scoring/basic/scoring_fn/fn_defs/bfcl.py
@ -0,0 +1,21 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from llama_stack.apis.common.type_system import NumberType
+from llama_stack.apis.scoring_functions import (
+    AggregationFunctionType,
+    BasicScoringFnParams,
+    ScoringFn,
+)
+
+# Definition of the `basic::bfcl` scoring function: a numeric per-row score from the "basic" provider,
+# aggregated with the `accuracy` aggregation function.
+bfcl = ScoringFn(
+    identifier="basic::bfcl",
+    description="BFCL complex scoring",
+    return_type=NumberType(),
+    provider_id="basic",
+    provider_resource_id="bfcl",
+    params=BasicScoringFnParams(aggregation_functions=[AggregationFunctionType.accuracy]),
+)
--- a/llama_stack/providers/inline/scoring/basic/scoring_fn/fn_defs/docvqa.py
+++ b/llama_stack/providers/inline/scoring/basic/scoring_fn/fn_defs/docvqa.py
@ -0,0 +1,21 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from llama_stack.apis.common.type_system import NumberType
+from llama_stack.apis.scoring_functions import (
+    AggregationFunctionType,
+    BasicScoringFnParams,
+    ScoringFn,
+)
+
+# Definition of the `basic::docvqa` scoring function: a numeric per-row score from the "basic" provider,
+# aggregated with the `accuracy` aggregation function.
+docvqa = ScoringFn(
+    identifier="basic::docvqa",
+    description="DocVQA Visual Question & Answer scoring function",
+    return_type=NumberType(),
+    provider_id="basic",
+    provider_resource_id="docvqa",
+    params=BasicScoringFnParams(aggregation_functions=[AggregationFunctionType.accuracy]),
+)
--- a/llama_stack/providers/inline/scoring/basic/scoring_fn/fn_defs/ifeval.py
+++ b/llama_stack/providers/inline/scoring/basic/scoring_fn/fn_defs/ifeval.py
@ -0,0 +1,23 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from llama_stack.apis.common.type_system import NumberType
+from llama_stack.apis.scoring_functions import (
+    AggregationFunctionType,
+    BasicScoringFnParams,
+    ScoringFn,
+)
+
+# Definition of the `basic::ifeval` scoring function: a numeric per-row score from the "basic" provider,
+# aggregated with the `weighted_average` aggregation function.
+ifeval = ScoringFn(
+    identifier="basic::ifeval",
+    description="Eval instruction follow capacity by checking how many instructions can be followed in each example",
+    return_type=NumberType(),
+    provider_id="basic",
+    provider_resource_id="ifeval",
+    params=BasicScoringFnParams(
+        aggregation_functions=[AggregationFunctionType.weighted_average],
+    ),
+)
--- a/llama_stack/providers/inline/scoring/basic/scoring_fn/ifeval_scoring_fn.py
+++ b/llama_stack/providers/inline/scoring/basic/scoring_fn/ifeval_scoring_fn.py
@ -0,0 +1,80 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from typing import Any, Dict, Optional
+
+from llama_stack.apis.scoring import ScoringResultRow
+from llama_stack.apis.scoring_functions import ScoringFnParams
+from llama_stack.providers.utils.scoring.base_scoring_fn import RegisteredBaseScoringFn
+
+from .fn_defs.ifeval import (
+    ifeval,
+)
+
+
+class IfEvalScoringFn(RegisteredBaseScoringFn):
+    """
+    A scoring_fn for the Instruction-Following Eval (IFEval) benchmark
+    """
+
+    def __init__(self, *args, **kwargs) -> None:
+        super().__init__(*args, **kwargs)
+        self.supported_fn_defs_registry = {
+            ifeval.identifier: ifeval,
+        }
+
+    async def score_row(
+        self,
+        input_row: Dict[str, Any],
+        scoring_fn_identifier: Optional[str] = None,
+        scoring_params: Optional[ScoringFnParams] = None,
+    ) -> ScoringResultRow:
+        """Score one row as the fraction of its instructions that the generated answer follows.
+
+        :param input_row: row providing ``instruction_id_list``, ``generated_answer``, a ``kwargs`` list
+            with one entry per instruction and, for instructions that take it, ``prompt``.
+        :param scoring_fn_identifier: key into the supported-definitions registry; must not be None.
+        :param scoring_params: when given, replaces the params of the registered definition.
+        :return: ``score`` (followed / total) and ``weight`` (number of instructions); both are 0.0 when
+            the row lists no instructions.
+        """
+        # Imported lazily, only when a row is actually scored.
+        from ..utils.ifeval_utils import INSTRUCTION_DICT, INSTRUCTION_LIST
+
+        assert scoring_fn_identifier is not None, "Scoring function identifier not found."
+        fn_def = self.supported_fn_defs_registry[scoring_fn_identifier]
+        if scoring_params is not None:
+            fn_def.params = scoring_params
+
+        instruction_list = input_row["instruction_id_list"]
+        generated_answer = input_row["generated_answer"].strip()
+
+        is_following_list = []
+        # NOTE(review): these per-instruction / per-category counters are updated below but are not part
+        # of the returned row — confirm whether they are still needed.
+        results = dict(
+            {k + "_correct": 0.0 for k in INSTRUCTION_LIST},
+            **{k + "_total": 0.0 for k in INSTRUCTION_LIST},
+        )
+
+        for index, instruction_id in enumerate(instruction_list):
+            instruction_cls = INSTRUCTION_DICT[instruction_id]
+            instruction = instruction_cls(instruction_id)
+            category = instruction_id.split(":")[0]
+            results[instruction_id + "_total"] += 1.0
+            results[category + "_total"] += 1.0
+
+            # Only pass the keyword arguments that are actually set for this instruction.
+            clean_input_row = {k: v for k, v in input_row["kwargs"][index].items() if v is not None}
+            instruction.build_description(**clean_input_row)
+            args = instruction.get_instruction_args()
+            if args and "prompt" in args:
+                instruction.build_description(prompt=input_row["prompt"])
+
+            # An empty answer never counts as following an instruction.
+            if generated_answer and instruction.check_following(generated_answer):
+                is_following_list.append(True)
+                results[instruction_id + "_correct"] += 1.0
+                results[category + "_correct"] += 1.0
+            else:
+                is_following_list.append(False)
+
+        if not is_following_list:
+            return {
+                "score": 0.0,
+                "weight": 0.0,
+            }
+
+        return {
+            "score": float(sum(is_following_list)) / float(len(is_following_list)),
+            "weight": float(len(is_following_list)),
+        }
--- a/llama_stack/providers/inline/scoring/basic/utils/bfcl/init.py
+++ b/llama_stack/providers/inline/scoring/basic/utils/bfcl/init.py
@ -3,10 +3,3 @@
 #
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
-
-from pydantic import BaseModel
-
-
-class SampleConfig(BaseModel):
-    host: str = "localhost"
-    port: int = 9999
--- a/llama_stack/providers/inline/scoring/basic/utils/bfcl/ast_parser.py
+++ b/llama_stack/providers/inline/scoring/basic/utils/bfcl/ast_parser.py
@ -0,0 +1,296 @@
+# ruff: noqa
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+import ast
+
+from .tree_sitter import get_parser
+
+
+def parse_java_function_call(source_code):
+    """Parse a Java method invocation and return it as ``[{function_name: arguments}]``.
+
+    The source is parsed with the "java" parser from `get_parser`; the tree is searched depth-first for
+    the first `method_invocation` node. `function_name` is ``object.method`` when the invocation has an
+    object, otherwise just the method name. `arguments` maps parameter names to extracted text values.
+
+    Returns ``{}`` when no invocation with an arguments node is found.
+    Raises ``Exception`` when the parse tree contains an error or when the same argument name is given
+    more than once.
+    """
+    if not source_code.endswith(";"):
+        source_code += ";"  # Necessary for the parser not to register an error
+    parser = get_parser("java")
+    tree = parser.parse(bytes(source_code, "utf8"))
+    root_node = tree.root_node
+
+    if root_node.has_error:
+        raise Exception("Error parsing java the source code.")
+
+    def get_text(node):
+        """Returns the text represented by the node."""
+        # NOTE(review): start_byte/end_byte are presumably offsets into the UTF-8 buffer handed to the
+        # parser, while source_code is a str — non-ASCII input may slice at the wrong place; confirm.
+        return source_code[node.start_byte : node.end_byte]
+
+    def traverse_node(node, nested=False):
+        # Render a value node as text. `nested` is True for every recursive call; at the top level
+        # string and character literals lose their surrounding quotes, when nested they keep them.
+        if node.type == "string_literal":
+            if nested:
+                return get_text(node)
+            # Strip surrounding quotes from string literals
+            return get_text(node)[1:-1]
+        elif node.type == "character_literal":
+            if nested:
+                return get_text(node)
+            # Strip surrounding single quotes from character literals
+            return get_text(node)[1:-1]
+        # NOTE(review): the string literal below is a no-op expression statement, not a docstring
+        # (it is not the first statement of the function).
+        """Traverse the node to collect texts for complex structures."""
+        if node.type in [
+            "identifier",
+            "class_literal",
+            "type_identifier",
+            "method_invocation",
+        ]:
+            return get_text(node)
+        elif node.type == "array_creation_expression":
+            # Handle array creation expression specifically
+            type_node = node.child_by_field_name("type")
+            value_node = node.child_by_field_name("value")
+            type_text = traverse_node(type_node, True)
+            value_text = traverse_node(value_node, True)
+            return f"new {type_text}[]{value_text}"
+        elif node.type == "object_creation_expression":
+            # Handle object creation expression specifically
+            type_node = node.child_by_field_name("type")
+            arguments_node = node.child_by_field_name("arguments")
+            type_text = traverse_node(type_node, True)
+            if arguments_node:
+                # Process each argument carefully, avoiding unnecessary punctuation
+                argument_texts = []
+                for child in arguments_node.children:
+                    if child.type not in [
+                        ",",
+                        "(",
+                        ")",
+                    ]:  # Exclude commas and parentheses
+                        argument_text = traverse_node(child, True)
+                        argument_texts.append(argument_text)
+                arguments_text = ", ".join(argument_texts)
+                return f"new {type_text}({arguments_text})"
+            else:
+                return f"new {type_text}()"
+        elif node.type == "set":
+            # Handling sets specifically
+            items = [traverse_node(n, True) for n in node.children if n.type not in [",", "set"]]
+            return "{" + ", ".join(items) + "}"
+
+        elif node.child_count > 0:
+            # Any other composite node: concatenate the rendered children with no separator.
+            return "".join(traverse_node(child, True) for child in node.children)
+        else:
+            return get_text(node)
+
+    def extract_arguments(args_node):
+        # Build {name: value} from the children of an arguments node. Unnamed arguments are stored under
+        # the key None. A repeated key turns its value into a list, which the caller treats as an error.
+        # Children of any other type (for example positional literals) are ignored.
+        arguments = {}
+        for child in args_node.children:
+            if child.type == "assignment_expression":
+                # For named parameters
+                # children[1] is skipped: presumably the "=" token between name and value.
+                name_node, value_node = child.children[0], child.children[2]
+                name = get_text(name_node)
+                value = traverse_node(value_node)
+                if name in arguments:
+                    if not isinstance(arguments[name], list):
+                        arguments[name] = [arguments[name]]
+                    arguments[name].append(value)
+                else:
+                    arguments[name] = value
+                # arguments.append({'name': name, 'value': value})
+            elif child.type in ["identifier", "class_literal", "set"]:
+                # For unnamed parameters and handling sets
+                value = traverse_node(child)
+                if None in arguments:
+                    if not isinstance(arguments[None], list):
+                        arguments[None] = [arguments[None]]
+                    arguments[None].append(value)
+                else:
+                    arguments[None] = value
+        return arguments
+
+    def traverse(node):
+        # Depth-first search for the first method invocation. Returns [{function_name: arguments}], or
+        # None when nothing is found. A method_invocation node without an arguments node also yields
+        # None, and its children are not searched.
+        if node.type == "method_invocation":
+            # Extract the function name and its arguments
+            method_name = get_text(node.child_by_field_name("name"))
+            class_name_node = node.child_by_field_name("object")
+            if class_name_node:
+                class_name = get_text(class_name_node)
+                function_name = f"{class_name}.{method_name}"
+            else:
+                function_name = method_name
+            arguments_node = node.child_by_field_name("arguments")
+            if arguments_node:
+                arguments = extract_arguments(arguments_node)
+                for key, value in arguments.items():
+                    # extract_arguments only produces a list value when a name was repeated.
+                    if isinstance(value, list):
+                        raise Exception("Error: Multiple arguments with the same name are not supported.")
+                return [{function_name: arguments}]
+
+        else:
+            for child in node.children:
+                result = traverse(child)
+                if result:
+                    return result
+
+    result = traverse(root_node)
+    # NOTE(review): the success value is a list but the fallback is an empty dict.
+    return result if result else {}
+
+
+def parse_javascript_function_call(source_code):
+    """Parse a JavaScript function call and return it as ``[{function_name: parameters}]``.
+
+    The source is parsed with the "javascript" parser from `get_parser`. Only a `call_expression` that
+    is a direct child of a top-level `expression_statement` is considered, and the first one found is
+    returned.
+
+    Returns None (implicitly) when no such call exists.
+    Raises ``Exception`` when the parse tree contains an error or when the same argument name is given
+    more than once.
+    """
+    if not source_code.endswith(";"):
+        source_code += ";"  # Necessary for the parser not to register an error
+    parser = get_parser("javascript")
+    # Parse the source code
+    tree = parser.parse(bytes(source_code, "utf8"))
+    root_node = tree.root_node
+    if root_node.has_error:
+        raise Exception("Error js parsing the source code.")
+
+    # Function to recursively extract argument details
+    # (Despite the comment above, this helper only inspects the direct children of `node`.)
+    def extract_arguments(node):
+        # Build {name: value}. Unnamed arguments are stored under the key None. A repeated key turns its
+        # value into a list, which the caller treats as an error. Other child types are ignored.
+        args = {}
+        for child in node.children:
+            if child.type == "assignment_expression":
+                # Extract left (name) and right (value) parts of the assignment
+                # children[1] is skipped: presumably the "=" token between name and value.
+                name = child.children[0].text.decode("utf-8")
+                value = child.children[2].text.decode("utf-8")
+                if (value.startswith('"') and value.endswith('"')) or (value.startswith("'") and value.endswith("'")):
+                    value = value[1:-1]  # Trim the quotation marks
+                if name in args:
+                    if not isinstance(args[name], list):
+                        args[name] = [args[name]]
+                    args[name].append(value)
+                else:
+                    args[name] = value
+
+            elif child.type == "identifier" or child.type == "true":
+                # Handle non-named arguments and boolean values
+                # NOTE(review): only the "true" node type is matched here; "false" is not — confirm intent.
+                value = child.text.decode("utf-8")
+                if None in args:
+                    if not isinstance(args[None], list):
+                        args[None] = [args[None]]
+                    args[None].append(value)
+                else:
+                    args[None] = value
+        return args
+
+    # Find the function call and extract its name and arguments
+    if root_node.type == "program":
+        for child in root_node.children:
+            if child.type == "expression_statement":
+                for sub_child in child.children:
+                    if sub_child.type == "call_expression":
+                        # children[0] is taken as the callee and children[1] as the arguments node.
+                        function_name = sub_child.children[0].text.decode("utf8")
+                        arguments_node = sub_child.children[1]
+                        parameters = extract_arguments(arguments_node)
+                        for key, value in parameters.items():
+                            # extract_arguments only produces a list value when a name was repeated.
+                            if isinstance(value, list):
+                                raise Exception("Error: Multiple arguments with the same name are not supported.")
+                        result = [{function_name: parameters}]
+                        return result
+
+
+def ast_parse(input_str, language="Python"):
+    """Decode a bracketed list of function calls written in `language`.
+
+    For "Python" the text is stripped of leading/trailing ``[``, ``]`` and ``'`` characters, parsed as
+    an expression, and each call is converted with `resolve_ast_call`. For "Java" and "JavaScript" the
+    first and last characters are dropped and the rest goes to the matching tree-sitter based parser.
+
+    :raises NotImplementedError: for any other language.
+    """
+    if language == "Java":
+        # Remove the [ and ] from the string
+        return parse_java_function_call(input_str[1:-1])
+    if language == "JavaScript":
+        return parse_javascript_function_call(input_str[1:-1])
+    if language != "Python":
+        raise NotImplementedError(f"Unsupported language: {language}")
+
+    expression = ast.parse(input_str.strip("[]'"), mode="eval").body
+    # A single call stands on its own; anything else is expected to expose its calls via `.elts`.
+    call_nodes = [expression] if isinstance(expression, ast.Call) else expression.elts
+    return [resolve_ast_call(node) for node in call_nodes]
+
+
+def resolve_ast_call(elem):
+    """Convert an ``ast.Call`` node into ``{dotted_function_name: {argument_name: value}}``.
+
+    Arguments come from two places: keyword arguments, and dictionary literals passed positionally
+    (their constant keys become argument names). Other positional arguments are ignored.
+
+    :param elem: the ``ast.Call`` node to convert.
+    :return: a single-entry dict keyed by the dotted function name.
+    :raises ValueError: when a positional dictionary uses a key that is not a literal constant.
+    """
+    # Handle nested attributes for deeply nested module paths
+    func_parts = []
+    func_part = elem.func
+    while isinstance(func_part, ast.Attribute):
+        func_parts.append(func_part.attr)
+        func_part = func_part.value
+    if isinstance(func_part, ast.Name):
+        func_parts.append(func_part.id)
+    # The parts were collected from the innermost attribute outwards, hence the reversal.
+    func_name = ".".join(reversed(func_parts))
+
+    args_dict = {}
+    # Parse when args are simply passed as an unnamed dictionary arg
+    for arg in elem.args:
+        if not isinstance(arg, ast.Dict):
+            continue
+        for key, value in zip(arg.keys, arg.values):
+            # Fail explicitly: a non-constant key (a name, or the None that marks a `**spread`) has no
+            # usable argument name, and falling through would reuse the previous entry's name.
+            if not isinstance(key, ast.Constant):
+                raise ValueError(
+                    f"Unsupported dictionary key in call to {func_name!r}: "
+                    f"expected a literal constant, got {type(key).__name__}."
+                )
+            args_dict[key.value] = resolve_ast_by_type(value)
+    for keyword in elem.keywords:
+        args_dict[keyword.arg] = resolve_ast_by_type(keyword.value)
+    return {func_name: args_dict}
+
+
+def resolve_ast_by_type(value):
+    """Convert an AST expression node into a plain Python value.
+
+    Constants become their value (the ``...`` literal becomes the string "..."), lists/tuples/dicts are
+    converted element-wise, names become their identifier, binary operations are evaluated, calls
+    without keywords and subscripts become source text, and calls with keywords are converted with
+    `resolve_ast_call`.
+
+    :param value: the AST node to convert.
+    :return: the converted Python value.
+    :raises Exception: for node types that are not handled.
+    """
+    if isinstance(value, ast.Constant):
+        # Covers numbers, strings, bytes, booleans, None and Ellipsis. The legacy node classes
+        # (ast.NameConstant, ast.Ellipsis) are deliberately not referenced: they no longer exist in
+        # Python 3.14, where merely evaluating `ast.NameConstant` raises AttributeError.
+        output = "..." if value.value is Ellipsis else value.value
+    elif isinstance(value, ast.UnaryOp):
+        # Apply the operator that was actually written instead of negating unconditionally.
+        operand = value.operand.value
+        if isinstance(value.op, ast.USub):
+            output = -operand
+        elif isinstance(value.op, ast.UAdd):
+            output = +operand
+        elif isinstance(value.op, ast.Not):
+            output = not operand
+        else:  # ast.Invert
+            output = ~operand
+    elif isinstance(value, ast.List):
+        output = [resolve_ast_by_type(v) for v in value.elts]
+    elif isinstance(value, ast.Dict):
+        output = {resolve_ast_by_type(k): resolve_ast_by_type(v) for k, v in zip(value.keys, value.values)}
+    elif isinstance(value, ast.BinOp):
+        # SECURITY(review): this evaluates text derived from the input being scored. A BinOp operand
+        # can itself be a call, so the expression can execute arbitrary code. Kept as-is to preserve
+        # behavior; replace with a restricted evaluator if the input is not trusted.
+        output = eval(ast.unparse(value))
+    elif isinstance(value, ast.Name):
+        output = value.id
+    elif isinstance(value, ast.Call):
+        if len(value.keywords) == 0:
+            output = ast.unparse(value)
+        else:
+            output = resolve_ast_call(value)
+    elif isinstance(value, ast.Tuple):
+        output = tuple(resolve_ast_by_type(v) for v in value.elts)
+    elif isinstance(value, ast.Lambda):
+        # NOTE(review): a Lambda's `body` is a single expression node, so indexing it raises TypeError
+        # and this branch cannot succeed. Left unchanged because the intended result is unclear.
+        output = eval(ast.unparse(value.body[0].value))
+    elif isinstance(value, ast.Subscript):
+        # Render `target[index]` as source text.
+        output = ast.unparse(value.value) + "[" + ast.unparse(value.slice) + "]"
+    else:
+        raise Exception(f"Unsupported AST type: {type(value)}")
+    return output
+
+
+def decode_ast(result, language="Python"):
+    """Decode raw model output into a list of function calls via `ast_parse`.
+
+    Newlines are removed and the text is wrapped in ``[`` / ``]`` when either bracket is missing.
+    """
+    text = result.replace("\n", "")  # remove new line characters
+    opening = "" if text.startswith("[") else "["
+    closing = "" if text.endswith("]") else "]"
+    return ast_parse(opening + text + closing, language)
+
+
+def decode_execute(result):
+    """Decode raw model output into a list of executable call strings.
+
+    Newlines are removed, the text is wrapped in ``[`` / ``]`` when either bracket is missing, and it
+    is parsed with `ast_parse` (default language). Each decoded call is rendered as
+    ``name(arg=repr(value),...)``.
+    """
+    text = result.replace("\n", "")  # remove new line characters
+    opening = "" if text.startswith("[") else "["
+    closing = "" if text.endswith("]") else "]"
+    decoded_calls = ast_parse(opening + text + closing)
+
+    execution_list = []
+    for call in decoded_calls:
+        for name, arguments in call.items():
+            rendered_args = ",".join(f"{k}={v!r}" for k, v in arguments.items())
+            execution_list.append(f"{name}({rendered_args})")
+    return execution_list
--- a/llama_stack/providers/inline/scoring/basic/utils/bfcl/checker.py
+++ b/llama_stack/providers/inline/scoring/basic/utils/bfcl/checker.py
@ -0,0 +1,989 @@
+# ruff: noqa
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+import json
+import re
+import time
+from typing import Any
+
+# Commented out for now until we actually use the REST checker in evals
+# import requests  # Do not remove this import even though it seems to be unused. It's used in the executable_checker_rest function.
+
+
+class NoAPIKeyError(Exception):
+    """Raised to report that the API keys have not been filled in; always carries a fixed message."""
+
+    def __init__(self):
+        message = "❗️Please fill in the API keys in the function_credential_config.json file. If you do not provide the API keys, the executable test category results will be inaccurate."
+        super().__init__(message)
+        # Also exposed as an attribute, in addition to the standard exception args.
+        self.message = message
+
+
+# NOTE(review): not referenced in the visible part of this file; presumably the tolerated relative
+# difference when comparing real-time execution results — confirm against its users.
+REAL_TIME_MATCH_ALLOWED_DIFFERENCE = 0.2
+
+
+# Java type names mapped to the Python type used when checking a value of that type.
+JAVA_TYPE_CONVERSION = {
+    "byte": int,
+    "short": int,
+    "integer": int,
+    "float": float,
+    "double": float,
+    "long": int,
+    "boolean": bool,
+    "char": str,
+    "Array": list,
+    "ArrayList": list,
+    "Set": set,
+    "HashMap": dict,
+    "Hashtable": dict,
+    "Queue": list,  # this can be `queue.Queue` as well, for simplicity we check with list
+    "Stack": list,
+    "String": str,
+    "any": str,
+}
+
+# JavaScript type names mapped to the Python type used when checking a value of that type.
+JS_TYPE_CONVERSION = {
+    "String": str,
+    "integer": int,
+    "float": float,
+    "Bigint": int,
+    "Boolean": bool,
+    "dict": dict,
+    "array": list,
+    "any": str,
+}
+
+# We switch to conditional import for the following two imports to avoid unnecessary installations.
+# User doesn't need to setup the tree-sitter packages if they are not running the test for that language.
+# from js_type_converter import js_type_converter
+# from java_type_converter import java_type_converter
+
+# Python type names mapped to the Python type used when checking a value of that type.
+# Note that "tuple" is checked as a list and "any" as a str.
+PYTHON_TYPE_MAPPING = {
+    "string": str,
+    "integer": int,
+    "float": float,
+    "boolean": bool,
+    "array": list,
+    "tuple": list,
+    "dict": dict,
+    "any": str,
+}
+
+# This is the list of types that we need to recursively check its values
+PYTHON_NESTED_TYPE_CHECK_LIST = ["array", "tuple"]
+
+
+# Java / JavaScript container type names whose elements need their own conversion.
+# NOTE(review): usage is outside the visible part of this file — confirm.
+NESTED_CONVERSION_TYPE_LIST = ["Array", "ArrayList", "array"]
+
+
+#### Helper functions for AST ####
+def find_description(func_descriptions, name):
+    """Look up the description of the function called `name`.
+
+    :param func_descriptions: either a list of description dicts (each with a ``"name"`` key) or a
+        single description.
+    :param name: function name to search for.
+    :return: the matching description, or None when the list has no match. A non-list input is
+        returned as-is without checking its name.
+    """
+    if type(func_descriptions) is not list:
+        # it is a dict, there is only one function
+        return func_descriptions
+    return next(
+        (description for description in func_descriptions if description["name"] == name),
+        None,
+    )
+
+
+def get_possible_answer_type(possible_answer: list):
+    """Return the type of the first possible answer that is not the empty string.
+
+    An empty string marks an optional parameter and is skipped. Returns None when every entry is an
+    empty string or the list is empty.
+    """
+    concrete_types = (type(answer) for answer in possible_answer if answer != "")
+    return next(concrete_types, None)
+
+
+def type_checker(
+    param: str,
+    value,
+    possible_answer: list,
+    expected_type_description: str,
+    expected_type_converted,
+    nested_type_converted,
+):
+    """Check that `value` has the expected Python type, one nesting level deep.
+
+    :param param: parameter name, used only in error messages.
+    :param value: the value to check.
+    :param possible_answer: accepted answers; the type of the first non-empty-string entry is used to
+        detect the case where a variable is passed instead of a literal value.
+    :param expected_type_description: type name used in error messages.
+    :param expected_type_converted: the Python type `value` must have.
+    :param nested_type_converted: expected element type, or None when elements are not checked.
+    :return: a dict with "valid", "error" (list of messages) and "is_variable"; "error_type" is present
+        on every path except a successful nested check.
+    """
+    # NOTE: This type checker only supports nested type checking for one level deep.
+    # We didn't implement recursive type checking for nested types, as it's not needed for the current use case and it's very complex.
+
+    result: Any = {
+        "valid": True,
+        "error": [],
+        "is_variable": False,
+        "error_type": "type_error:simple",
+    }
+
+    is_variable = False
+    # check for the case where a variable is used instead of an actual value.
+    # use the type in possible_answer as the expected type
+    possible_answer_type = get_possible_answer_type(possible_answer)
+    # if possible_answer only contains optional parameters, we can't determine the type
+    if possible_answer_type != None:
+        # we are being precise here.
+        # in fact, possible_answer_type should always be string, as that's how we treat variable in possible_answer
+        if possible_answer_type != expected_type_converted:
+            is_variable = True
+
+    # value is the same type as in function description
+    if type(value) == expected_type_converted:
+        # We don't need to do recursive check for simple types
+        if nested_type_converted == None:
+            result["is_variable"] = is_variable
+            return result
+        else:
+            for possible_answer_item in possible_answer:
+                flag = True  # Each parameter should match to at least one possible answer type.
+                # Here, we assume that each item should be the same type. We could also relax it.
+                # A possible answer that is not a list leaves `flag` True, so it is accepted as-is.
+                if type(possible_answer_item) == list:
+                    for value_item in value:
+                        checker_result = type_checker(
+                            param,
+                            value_item,
+                            possible_answer_item,
+                            str(nested_type_converted),
+                            nested_type_converted,
+                            None,
+                        )
+                        if not checker_result["valid"]:
+                            flag = False
+                            break
+
+                if flag:
+                    # This success result carries no "error_type" key.
+                    return {"valid": True, "error": [], "is_variable": is_variable}
+
+            # No possible answer matched the element types. Execution deliberately continues below
+            # instead of returning here.
+            result["valid"] = False
+            result["error"] = [
+                f"Nested type checking failed for parameter {repr(param)}. Expected outer type {expected_type_description} with inner type {str(nested_type_converted)}. Parameter value: {repr(value)}."
+            ]
+            result["error_type"] = "type_error:nested"
+
+    # value is not as expected, check for the case where a variable is used instead of an actual value
+    # use the type in possible_answer as the expected type
+    possible_answer_type = get_possible_answer_type(possible_answer)
+    # if possible_answer only contains optional parameters, we can't determine the type
+    if possible_answer_type != None:
+        # we are being precise here.
+        # in fact, possible_answer_type should always be string, as that's how we treat variable in possible_answer
+        if type(value) == possible_answer_type:
+            # NOTE(review): after a nested failure this returns the result with "valid" still False.
+            result["is_variable"] = True
+            return result
+
+    # NOTE(review): when reached after a nested failure, this appends a second message and resets
+    # "error_type" from "type_error:nested" to "type_error:simple" — confirm that is intended.
+    result["valid"] = False
+    result["error"].append(
+        f"Incorrect type for parameter {repr(param)}. Expected type {expected_type_description}, got {type(value).__name__}. Parameter value: {repr(value)}."
+    )
+    result["error_type"] = "type_error:simple"
+    return result
+
+
+def standardize_string(input_string: str):
+    """Canonicalize a string before comparing model output with the possible answers.
+
+    Spaces and the punctuation characters ``,./-_*^`` are removed, the result is lower-cased, and
+    every single quote becomes a double quote. This keeps answers such as "April 1, 2024",
+    "April 1,2024" and "April 1 2024" from being treated as different.
+    """
+    without_separators = re.sub(r"[ \,\.\/\-\_\*\^]", "", input_string)
+    lowered = without_separators.lower()
+    return lowered.replace("'", '"')
+
+
+def string_checker(param: str, model_output: str, possible_answer: list):
+    """Check a string argument against the accepted answers after normalization.
+
+    Both the model output and every string entry of ``possible_answer`` are
+    passed through ``standardize_string`` before the membership test, so the
+    comparison ignores case and the punctuation that function strips.
+    Non-string entries of ``possible_answer`` are never candidates.
+
+    Returns ``{"valid": True, "error": []}`` on a match, otherwise a failure
+    dictionary with ``error_type`` set to ``"value_error:string"``.
+    """
+    normalized_output = standardize_string(model_output)
+    normalized_candidates = [
+        standardize_string(candidate) for candidate in possible_answer if type(candidate) == str
+    ]
+
+    if normalized_output in normalized_candidates:
+        return {"valid": True, "error": []}
+
+    return {
+        "valid": False,
+        "error": [
+            f"Invalid value for parameter {repr(param)}: {repr(model_output)}. Expected one of {possible_answer}. Case insensitive."
+        ],
+        "error_type": "value_error:string",
+    }
+
+
+def list_checker(param: str, model_output: list, possible_answer: list):
+    """Check a list (or tuple) argument against the accepted list answers.
+
+    String elements on both sides are normalized with ``standardize_string``;
+    all other elements are compared unchanged. The normalized model output
+    must be equal to one of the normalized candidate lists.
+
+    Returns ``{"valid": True, "error": []}`` on a match, otherwise a failure
+    dictionary with ``error_type`` set to ``"value_error:list/tuple"``.
+    """
+
+    def _normalize(item):
+        # Only strings are standardized; other values pass through untouched.
+        return standardize_string(item) if type(item) == str else item
+
+    # Building a new list also converts a tuple value into a list.
+    normalized_output = [_normalize(item) for item in model_output]
+    normalized_candidates = [[_normalize(item) for item in candidate] for candidate in possible_answer]
+
+    if normalized_output in normalized_candidates:
+        return {"valid": True, "error": []}
+
+    return {
+        "valid": False,
+        "error": [
+            f"Invalid value for parameter {repr(param)}: {repr(model_output)}. Expected one of {possible_answer}."
+        ],
+        "error_type": "value_error:list/tuple",
+    }
+
+
+def dict_checker(param: str, model_output: dict, possible_answers: list):
+    """Validate a flat dictionary argument against a list of candidate answers.
+
+    Each candidate maps a key to the list of values accepted for that key; a
+    key whose list contains ``""`` may be omitted by the model. Candidates
+    equal to ``""`` are skipped. Nested dictionaries are not supported, only
+    simple ones.
+
+    Returns ``{"valid": True, "error": []}`` as soon as one candidate matches,
+    otherwise the failure report built for the last candidate examined (or the
+    initial "unclear" report when no candidate was examined).
+    """
+
+    def _normalize(item):
+        # Strings are compared in standardized form; other values as-is.
+        return standardize_string(item) if type(item) == str else item
+
+    report = {"valid": False, "error": [], "error_type": "dict_checker:unclear"}
+
+    for candidate in possible_answers:
+        if candidate == "":
+            continue
+
+        # Every candidate dictionary gets a fresh report.
+        report = {"valid": False, "error": [], "error_type": "dict_checker:unclear"}
+        matched = True
+
+        for key, value in model_output.items():
+            if key not in candidate:
+                report["valid"] = False
+                report["error"].append(f"Unexpected dict key parameter: '{key}'.")  # type: ignore[attr-defined]
+                report["error_type"] = "value_error:dict_key"
+                matched = False
+                break
+
+            normalized_value = _normalize(value)
+            accepted_values = [_normalize(option) for option in candidate[key]]
+
+            if normalized_value not in accepted_values:
+                report["valid"] = False
+                report["error"].append(  # type: ignore[attr-defined]
+                    f"Invalid value for parameter {repr(key)}: {repr(value)}. Expected one of {accepted_values}."
+                )
+                report["error_type"] = "value_error:dict_value"
+                matched = False
+                break
+
+        # This pass runs even when the pass above already failed; a key is
+        # required unless "" is among its accepted values.
+        for key, options in candidate.items():
+            if key not in model_output and "" not in options:
+                report["valid"] = False
+                report["error"].append(f"Missing dict key parameter: '{key}'.")  # type: ignore[attr-defined]
+                report["error_type"] = "value_error:dict_key"
+                matched = False
+                break
+
+        if matched:
+            return {"valid": True, "error": []}
+
+    return report
+
+
+def list_dict_checker(param: str, model_output: list, possible_answers: list):
+    """Validate a list of dictionaries against candidate lists of dictionaries.
+
+    The dictionaries are compared position by position with ``dict_checker``,
+    so their order in ``model_output`` must match the order in the candidate.
+    A candidate whose length differs from ``model_output`` is rejected outright.
+
+    Returns ``{"valid": True, "error": []}`` for the first candidate in which
+    every dictionary validates, otherwise the most recent failure report.
+    """
+    outcome = {"valid": False, "error": [], "error_type": "list_dict_checker:unclear"}
+
+    for candidate in possible_answers:
+        # The number of dictionaries has to agree before comparing contents.
+        if len(model_output) != len(candidate):
+            outcome["valid"] = False
+            outcome["error"] = ["Wrong number of dictionaries in the list."]
+            outcome["error_type"] = "value_error:list_dict_count"
+            continue
+
+        every_dict_valid = True
+        for produced, expected in zip(model_output, candidate):
+            outcome = dict_checker(param, produced, [expected])
+            if not outcome["valid"]:
+                every_dict_valid = False
+                break
+
+        if every_dict_valid:
+            return {"valid": True, "error": []}
+
+    return outcome
+
+
+def simple_function_checker(
+    func_description: dict,
+    model_output: dict,
+    possible_answer: dict,
+    language: str,
+    model_name: str,
+):
+    """Check one model-produced function call against its description and answers.
+
+    Args:
+        func_description: Function schema; ``name``, ``parameters.properties``
+            and ``parameters.required`` are read from it.
+        model_output: Mapping from function name to the parameters the model
+            supplied for it.
+        possible_answer: Dictionary whose first value maps each parameter name
+            to the list of accepted values; ``""`` in that list marks the
+            parameter as optional.
+        language: ``"Java"``, ``"JavaScript"`` or ``"Python"``; selects how the
+            expected types are resolved and whether values are converted first.
+        model_name: Not referenced inside this function.
+
+    Returns:
+        A dictionary with ``valid``, ``error`` and (on failure) ``error_type``.
+        The first failing check is returned immediately.
+    """
+    # Only the first value of possible_answer is used: parameter name -> accepted values.
+    possible_answer = list(possible_answer.values())[0]
+    # Extract function name and parameters details
+    func_name = func_description["name"]
+    param_details = func_description["parameters"]["properties"]
+    required_params = func_description["parameters"]["required"]
+
+    # Initialize a result dictionary
+    result = {
+        "valid": True,
+        "error": [],
+        "error_type": "simple_function_checker:unclear",
+    }
+
+    # Check if function name matches
+    if func_name not in model_output:
+        result["valid"] = False
+        result["error"].append(  # type: ignore[attr-defined]
+            f"Function name {repr(func_name)} not found in model output."
+        )
+        result["error_type"] = "simple_function_checker:wrong_func_name"
+        return result
+
+    model_params = model_output[func_name]
+
+    # Check for required parameters in model output
+    for param in required_params:
+        if param not in model_params:
+            result["valid"] = False
+            result["error"].append(f"Missing required parameter: {repr(param)}.")  # type: ignore[attr-defined]
+            result["error_type"] = "simple_function_checker:missing_required"
+            return result
+
+    # Validate types and values for each parameter in model output
+    for param, value in model_params.items():
+        # A parameter must be known to both the schema and the possible answers.
+        if param not in param_details or param not in possible_answer:
+            result["valid"] = False
+            result["error"].append(f"Unexpected parameter: {repr(param)}.")  # type: ignore[attr-defined]
+            result["error_type"] = "simple_function_checker:unexpected_param"
+            return result
+
+        full_param_details = param_details[param]
+        expected_type_description = full_param_details["type"]  # This is a string
+        is_variable = False
+        nested_type_converted = None
+
+        # NOTE(review): expected_type_converted is only assigned inside the three
+        # language branches below; for any other language value it is unbound on
+        # the first iteration (or stale from a previous one) - confirm callers
+        # only pass "Java", "JavaScript" or "Python".
+        if language == "Java":
+            # NOTE(review): confirm this import path resolves inside this package.
+            from evals.utils.bfcl.java_type_converter import java_type_converter
+
+            # NOTE(review): this lookup raises KeyError for an unknown type before
+            # the membership test on the next statement can ever be False.
+            expected_type_converted = JAVA_TYPE_CONVERSION[expected_type_description]
+
+            if expected_type_description in JAVA_TYPE_CONVERSION:
+                # Java values must arrive as strings; they are converted below.
+                if type(value) != str:
+                    result["valid"] = False
+                    result["error"].append(  # type: ignore[attr-defined]
+                        f"Incorrect type for parameter {repr(param)}. Expected type String, got {type(value).__name__}. Parameter value: {repr(value)}."
+                    )
+                    result["error_type"] = "type_error:java"
+                    return result
+
+                if expected_type_description in NESTED_CONVERSION_TYPE_LIST:
+                    nested_type = param_details[param]["items"]["type"]
+                    nested_type_converted = JAVA_TYPE_CONVERSION[nested_type]
+                    value = java_type_converter(value, expected_type_description, nested_type)
+                else:
+                    value = java_type_converter(value, expected_type_description)
+
+        elif language == "JavaScript":
+            # NOTE(review): confirm this import path resolves inside this package.
+            from evals.utils.bfcl.js_type_converter import js_type_converter
+
+            # NOTE(review): same pattern as the Java branch - the lookup raises
+            # KeyError before the membership test below can be False.
+            expected_type_converted = JS_TYPE_CONVERSION[expected_type_description]
+
+            if expected_type_description in JS_TYPE_CONVERSION:
+                # JavaScript values must arrive as strings; they are converted below.
+                if type(value) != str:
+                    result["valid"] = False
+                    result["error"].append(  # type: ignore[attr-defined]
+                        f"Incorrect type for parameter {repr(param)}. Expected type String, got {type(value).__name__}. Parameter value: {repr(value)}."
+                    )
+                    result["error_type"] = "type_error:js"
+                    return result
+
+                if expected_type_description in NESTED_CONVERSION_TYPE_LIST:
+                    nested_type = param_details[param]["items"]["type"]
+                    nested_type_converted = JS_TYPE_CONVERSION[nested_type]
+                    value = js_type_converter(value, expected_type_description, nested_type)
+                else:
+                    value = js_type_converter(value, expected_type_description)
+
+        elif language == "Python":
+            expected_type_converted = PYTHON_TYPE_MAPPING[expected_type_description]
+            if expected_type_description in PYTHON_NESTED_TYPE_CHECK_LIST:
+                nested_type = param_details[param]["items"]["type"]
+                nested_type_converted = PYTHON_TYPE_MAPPING[nested_type]
+
+        # We convert all tuple value to list when the expected type is tuple.
+        # The conversion is necessary because any tuple in the possible answer would become a list after being processed through json.dump() and json.load().
+        # This does introduce some false positive (eg, when the model provides a list value instead of tuple). We hope to find a better solution in the future.
+        if expected_type_description == "tuple" and type(value) == tuple:
+            value = list(value)
+
+        # Allow python auto conversion from int to float
+        if language == "Python" and expected_type_description == "float" and type(value) == int:
+            value = float(value)
+
+        # Type checking
+        # In fact, we only check for Python here.
+        # Type check for other languages are handled by the type converter, and so their value (after conversion) is always correct.
+        type_check_result = type_checker(
+            param,
+            value,
+            possible_answer[param],
+            expected_type_description,
+            expected_type_converted,
+            nested_type_converted,
+        )
+        is_variable = type_check_result["is_variable"]
+        if not type_check_result["valid"]:
+            return type_check_result
+
+        # It doesn't make sense to special handle dictionaries and list of dictionaries if the value is a variable.
+        # We can just treat the variable as a string and use the normal flow.
+        if not is_variable:
+            # Each specialised checker below either returns its failure report
+            # or lets the loop continue with the next parameter.
+            # Special handle for dictionaries
+            if expected_type_converted == dict:
+                result = dict_checker(param, value, possible_answer[param])
+                if not result["valid"]:
+                    return result
+                continue
+
+            # Special handle for list of dictionaries
+            elif expected_type_converted == list and nested_type_converted == dict:
+                result = list_dict_checker(param, value, possible_answer[param])
+                if not result["valid"]:
+                    return result
+                continue
+
+            # Special handle for strings
+            elif expected_type_converted == str:
+                # We don't check for case sensitivity for string, as long as it's not a variable
+                result = string_checker(param, value, possible_answer[param])
+                if not result["valid"]:
+                    return result
+                continue
+
+            elif expected_type_converted == list:
+                result = list_checker(param, value, possible_answer[param])
+                if not result["valid"]:
+                    return result
+                continue
+
+        # Check if the value is within the possible answers
+        if value not in possible_answer[param]:
+            result["valid"] = False
+            result["error"].append(  # type: ignore[attr-defined]
+                f"Invalid value for parameter {repr(param)}: {repr(value)}. Expected one of {possible_answer[param]}."
+            )
+            result["error_type"] = "value_error:others"
+            return result
+
+    # Check for optional parameters not provided but allowed
+    # ("" among the accepted values is what marks a parameter as optional)
+    for param in possible_answer:
+        if param not in model_params and "" not in possible_answer[param]:
+            result["valid"] = False
+            result["error"].append(  # type: ignore[attr-defined]
+                f"Optional parameter {repr(param)} not provided and not marked as optional."
+            )
+            result["error_type"] = "simple_function_checker:missing_optional"
+            return result
+
+    return result
+
+
+def parallel_function_checker_enforce_order(
+    func_descriptions: list,
+    model_output: list,
+    possible_answers: dict,
+    language: str,
+    model_name: str,
+):
+    """Check several function calls whose order must match the expected order.
+
+    ``possible_answers`` maps each expected function name to its accepted
+    arguments; the i-th entry (in dictionary order) is checked against the
+    i-th element of ``model_output`` with ``simple_function_checker``.
+
+    Returns the first failing report, a "wrong_count" report when the number
+    of calls differs, or ``{"valid": True, "error": []}`` when all calls pass.
+    """
+    if len(model_output) != len(possible_answers):
+        return {
+            "valid": False,
+            "error": ["Wrong number of functions."],
+            "error_type": "parallel_function_checker_enforce_order:wrong_count",
+        }
+
+    for position, (func_name, expected_args) in enumerate(possible_answers.items()):
+        func_description = find_description(func_descriptions, func_name)
+
+        # The checker expects a single-key dictionary per expected call.
+        outcome = simple_function_checker(
+            func_description,
+            model_output[position],
+            {func_name: expected_args},
+            language,
+            model_name,
+        )
+        if not outcome["valid"]:
+            return outcome
+
+    return {"valid": True, "error": []}
+
+
+def parallel_function_checker_no_order(
+    func_descriptions: list,
+    model_output: list,
+    possible_answers: list,
+    language: str,
+    model_name: str,
+):
+    """Check several function calls without requiring a particular order.
+
+    Each expected call (a single-key dictionary) is matched against the model
+    outputs that have not been matched yet; a model output can satisfy only
+    one expected call.
+
+    Returns ``{"valid": True, "error": []}`` when every expected call finds a
+    match, a "wrong_count" report when the number of calls differs, or a
+    "cannot_find_match" report listing the per-output errors.
+    """
+    if len(model_output) != len(possible_answers):
+        return {
+            "valid": False,
+            "error": ["Wrong number of functions."],
+            "error_type": "parallel_function_checker_no_order:wrong_count",
+        }
+
+    matched_indices = []
+
+    # Walk the ground truth rather than the model output: the expected function
+    # name is what selects the function description to check against.
+    for answer_idx, expected_call in enumerate(possible_answers):
+        # expected_call is a dictionary with only one key
+        func_name_expected = list(expected_call.keys())[0]
+        func_description = find_description(func_descriptions, func_name_expected)
+
+        mismatches = []
+
+        for output_idx, produced_call in enumerate(model_output):
+            if output_idx in matched_indices:
+                continue
+
+            outcome = simple_function_checker(
+                func_description,
+                produced_call,
+                expected_call,
+                language,
+                model_name,
+            )
+
+            if outcome["valid"]:
+                matched_indices.append(output_idx)
+                break
+
+            mismatches.append(
+                {
+                    f"Model Result Index {output_idx}": {
+                        "sub_error": outcome["error"],
+                        "sub_error_type": outcome["error_type"],
+                        "model_output_item": produced_call,
+                        "possible_answer_item": expected_call,
+                    }
+                }
+            )
+        else:
+            # No unmatched model output satisfied this expected call.
+            remaining = [idx for idx in range(len(model_output)) if idx not in matched_indices]
+            mismatches.insert(
+                0,
+                f"Could not find a matching function among index {remaining} of model output for index {answer_idx} of possible answers.",  # type: ignore[arg-type]
+            )
+            return {
+                "valid": False,
+                "error": mismatches,
+                "error_type": "parallel_function_checker_no_order:cannot_find_match",
+            }
+
+    return {"valid": True, "error": []}
+
+
+def multiple_function_checker(
+    func_descriptions: list,
+    model_output: list,
+    possible_answers: list,
+    language: str,
+    model_name: str,
+):
+    """Check a single call when several function descriptions were offered.
+
+    The expected function name is taken from the first (and only) key of
+    ``possible_answers[0]``; its description is looked up and the first model
+    output is validated with ``simple_function_checker``.
+
+    Returns that checker's report, or a "wrong_count" report when the number
+    of model outputs differs from the number of possible answers.
+    """
+    if len(model_output) != len(possible_answers):
+        return {
+            "valid": False,
+            "error": ["Wrong number of functions."],
+            "error_type": "multiple_function_checker:wrong_count",
+        }
+
+    # possible_answers is a list of only one dictionary with only one key
+    expected_call = possible_answers[0]
+    func_name_expected = list(expected_call)[0]
+    func_description = find_description(func_descriptions, func_name_expected)
+
+    return simple_function_checker(func_description, model_output[0], expected_call, language, model_name)
+
+
+def patten_matcher(exec_output, expected_result, function_call, is_sanity_check):
+    """Compare the structure (not the values) of an execution result.
+
+    The output must have exactly the same type as ``expected_result``. For
+    dictionaries the key sets must agree, except in sanity-check mode where
+    only the number of entries is compared. For lists only the length is
+    compared. Any other type passes once the types agree.
+
+    Returns a ``valid`` report, or a failure report that also carries the
+    offending value under ``model_executed_output``.
+    """
+    success = {"valid": True, "error": [], "error_type": "executable_checker:unclear"}
+
+    def _failure(message, error_type):
+        return {
+            "valid": False,
+            "error": [message],
+            "error_type": error_type,
+            "model_executed_output": exec_output,
+        }
+
+    if type(exec_output) != type(expected_result):
+        return _failure(
+            f"Wrong execution result type for {repr(function_call)}. Expected type: {type(expected_result)}, but got: {type(exec_output)}.",
+            "executable_checker:wrong_result_type",
+        )
+
+    if type(exec_output) == dict:
+        # The sanity check is looser because its stored expected result may be
+        # out of date, e.g. when a key is a timestamp or a random number.
+        if is_sanity_check:
+            if len(exec_output) == len(expected_result):
+                return success
+            return _failure(
+                f"Wrong execution result pattern for {repr(function_call)}. Expect type Dict, but wrong number of elements in the output. Expected length: {len(expected_result)}, but got: {len(exec_output)}.",
+                "executable_checker:wrong_result_type:dict_length",
+            )
+
+        for key in expected_result:
+            if key not in exec_output:
+                return _failure(
+                    f"Wrong execution result pattern for {repr(function_call)}. Expect type Dict, but key {repr(key)} not found in the model output.",
+                    "executable_checker:wrong_result_type:dict_key_not_found",
+                )
+        for key in exec_output:
+            if key not in expected_result:
+                return _failure(
+                    f"Wrong execution result pattern for {repr(function_call)}. Expect type Dict, but key {repr(key)} not expected in the model output.",
+                    "executable_checker:wrong_result_type:dict_extra_key",
+                )
+    elif type(exec_output) == list and len(exec_output) != len(expected_result):
+        return _failure(
+            f"Wrong execution result pattern for {repr(function_call)}. Expect type list, but wrong number of elements in the output. Expected length: {len(expected_result)}, but got: {len(exec_output)}.",
+            "executable_checker:wrong_result_type:list_length",
+        )
+
+    return success
+
+
+#### Helper functions for Exec ####
+def executable_checker_simple(
+    function_call: str,
+    expected_result,
+    expected_result_type: str,
+    is_sanity_check=False,
+):
+    """Execute ``function_call`` and compare its result with ``expected_result``.
+
+    Args:
+        function_call: Source text of a call expression; it is executed after
+            ``from executable_python_function import *``.
+        expected_result: Reference value loaded from the ground truth.
+        expected_result_type: ``"exact_match"`` for equality,
+            ``"real_time_match"`` for a numeric comparison within the relative
+            tolerance ``REAL_TIME_MATCH_ALLOWED_DIFFERENCE``; any other value
+            selects the structural comparison done by ``patten_matcher``.
+        is_sanity_check: Forwarded to ``patten_matcher``.
+
+    Returns:
+        A report dictionary with ``valid``, ``error`` and ``error_type``;
+        failed comparisons also carry ``model_executed_output``.
+
+    Raises:
+        NoAPIKeyError: Propagated unchanged instead of being recorded as an
+            execution error.
+    """
+    result = {"valid": True, "error": [], "error_type": "executable_checker:unclear"}
+
+    exec_dict: Any = {}
+
+    try:
+        # SECURITY: function_call is executed verbatim with exec(). Only run
+        # this checker on trusted input or inside a sandbox.
+        exec(
+            "from executable_python_function import *" + "\nresult=" + function_call,
+            exec_dict,
+        )
+        exec_output = exec_dict["result"]
+    except NoAPIKeyError:
+        # Bare raise keeps the original traceback intact.
+        raise
+    except Exception as e:
+        result["valid"] = False
+        result["error"].append(  # type: ignore[attr-defined]
+            f"Error in execution: {repr(function_call)}. Error: {str(e)}"
+        )
+        result["error_type"] = "executable_checker:execution_error"
+        return result
+
+    # We need to special handle the case where the execution result is a tuple and convert it to a list
+    # Because when json is stored, the tuple is converted to a list, and so the expected result is a list when loaded from json
+    if isinstance(exec_output, tuple):
+        exec_output = list(exec_output)
+
+    if expected_result_type == "exact_match":
+        if exec_output != expected_result:
+            result["valid"] = False
+            result["error"].append(  # type: ignore[attr-defined]
+                f"Wrong execution result for {repr(function_call)}. Expected: {expected_result}, but got: {exec_output}."
+            )
+            result["error_type"] = "executable_checker:wrong_result"
+            result["model_executed_output"] = exec_output
+            return result
+
+    elif expected_result_type == "real_time_match":
+        # Exact type comparison on purpose: bool is not accepted as a number here.
+        numeric_types = (float, int)
+        if type(expected_result) in numeric_types and type(exec_output) in numeric_types:
+            # Order the two bounds explicitly. For a negative expected value
+            # expected * (1 - d) is the LARGER bound, so a fixed
+            # "low <= x <= high" comparison could never succeed.
+            bound_a = expected_result * (1 - REAL_TIME_MATCH_ALLOWED_DIFFERENCE)
+            bound_b = expected_result * (1 + REAL_TIME_MATCH_ALLOWED_DIFFERENCE)
+            lower, upper = min(bound_a, bound_b), max(bound_a, bound_b)
+            if not (lower <= exec_output <= upper):
+                result["valid"] = False
+                result["error"].append(  # type: ignore[attr-defined]
+                    f"Wrong execution result for {repr(function_call)}. Expected: {expected_result}, but got: {exec_output}. {REAL_TIME_MATCH_ALLOWED_DIFFERENCE * 100}% difference allowed."
+                )
+                result["error_type"] = "executable_checker:wrong_result_real_time"
+                result["model_executed_output"] = exec_output
+                return result
+        else:
+            result["valid"] = False
+            result["error"].append(  # type: ignore[attr-defined]
+                f"Wrong execution result for {repr(function_call)}. Expected: {expected_result}, but got: {exec_output}. Type needs to be float or int for real time match criteria."
+            )
+            result["error_type"] = "executable_checker:wrong_result_real_time"
+            result["model_executed_output"] = exec_output
+            return result
+
+    else:
+        # structural match
+        pattern_match_result = patten_matcher(exec_output, expected_result, function_call, is_sanity_check)
+        if not pattern_match_result["valid"]:
+            return pattern_match_result
+
+    return result
+
+
+def executable_checker_parallel_no_order(
+    decoded_result: list, expected_exec_result: list, expected_exec_result_type: list
+):
+    """Execute several calls and match their results to expectations in any order.
+
+    Each expected result is tried against every not-yet-matched call with
+    ``executable_checker_simple``; a call can satisfy only one expectation.
+
+    Returns a ``valid`` report when every expectation is matched, a
+    "value_error:exec_result_count" report when the number of calls differs,
+    or an "executable_checker:cannot_find_match" report with per-call errors.
+    """
+    if len(decoded_result) != len(expected_exec_result):
+        return {
+            "valid": False,
+            "error": [
+                f"Wrong number of functions provided. Expected {len(expected_exec_result)}, but got {len(decoded_result)}."
+            ],
+            "error_type": "value_error:exec_result_count",
+        }
+
+    matched_indices = []
+    for expected_idx, expected_value in enumerate(expected_exec_result):
+        mismatches = []
+        for call_idx, call_text in enumerate(decoded_result):
+            if call_idx in matched_indices:
+                continue
+
+            outcome = executable_checker_simple(
+                call_text,
+                expected_value,
+                expected_exec_result_type[expected_idx],
+                False,
+            )
+
+            if outcome["valid"]:
+                matched_indices.append(call_idx)
+                break
+
+            mismatches.append(
+                {
+                    f"Model Result Index {call_idx}": {
+                        "sub_error": outcome["error"],
+                        "sub_error_type": outcome["error_type"],
+                        # Absent when the failure happened before a result existed.
+                        "model_executed_output": outcome.get("model_executed_output"),
+                    }
+                }
+            )
+        else:
+            # None of the remaining calls produced this expected result.
+            remaining = [idx for idx in range(len(decoded_result)) if idx not in matched_indices]
+            mismatches.insert(
+                0,
+                f"Could not find a matching function among index {remaining} of model output for index {expected_idx} of possible answers.",  # type: ignore[arg-type]
+            )
+            return {
+                "valid": False,
+                "error": mismatches,
+                "error_type": "executable_checker:cannot_find_match",
+            }
+
+    return {"valid": True, "error": [], "error_type": "executable_checker:unclear"}
+
+
+#### Main function ####
+def executable_checker_rest(func_call, idx):
+    """Run a REST call expression and compare the response shape with ground truth.
+
+    Args:
+        func_call: Source text of the request expression. ``requests_get`` is
+            rewritten to ``requests.get`` before evaluation.
+        idx: Line index of the matching entry in the ground-truth JSONL file.
+
+    Returns:
+        A report dictionary with ``valid`` and ``error`` (and ``error_type`` on
+        most paths). A response is accepted when its status code is 200 and its
+        JSON body has the same top-level keys as the ground truth (dictionary),
+        or the same length and per-item keys (list).
+    """
+    # Move this here for now to avoid needing to read this file / fix paths to be relative to dataset_dir. Fix when it's actually needed / used.
+    EVAL_GROUND_TRUTH_PATH = "/mnt/wsfuse/fair_llm_v2/datasets/eval/bfcl/rest-eval-response_v5.jsonl"  # Ground truth file for v5 for rest execution
+    with open(EVAL_GROUND_TRUTH_PATH, "r", encoding="utf-8") as f:
+        EVAL_GROUND_TRUTH = f.readlines()
+    if "https://geocode.maps.co" in func_call:
+        # Pause before calling this host.
+        time.sleep(2)
+    if "requests_get" in func_call:
+        func_call = func_call.replace("requests_get", "requests.get")
+    try:
+        # SECURITY: func_call is evaluated verbatim with eval(). Only run this
+        # checker on trusted input or inside a sandbox.
+        response = eval(func_call)
+    except Exception as e:
+        return {
+            "valid": False,
+            "error": [f"Execution failed. {str(e)}"],
+            "error_type": "executable_checker_rest:execution_error",
+        }
+
+    try:
+        if response.status_code != 200:
+            return {
+                "valid": False,
+                "error": [f"Execution result status code is not 200, got {response.status_code}"],
+                "error_type": "executable_checker_rest:wrong_status_code",
+            }
+
+        eval_GT_json = json.loads(EVAL_GROUND_TRUTH[idx])
+        try:
+            # Decode the body once instead of re-parsing it for every check and
+            # every list element.
+            response_json = response.json()
+
+            if isinstance(eval_GT_json, dict):
+                if not isinstance(response_json, dict):
+                    return {
+                        "valid": False,
+                        "error": [f"Expected dictionary, but got {type(response_json)}"],
+                        "error_type": "executable_checker_rest:wrong_type",
+                    }
+                if set(eval_GT_json.keys()) == set(response_json.keys()):
+                    return {"valid": True, "error": [], "error_type": ""}
+                return {
+                    "valid": False,
+                    "error": ["Key inconsistency"],
+                    "error_type": "executable_checker_rest:wrong_key",
+                }
+
+            if isinstance(eval_GT_json, list):
+                if not isinstance(response_json, list):
+                    return {
+                        "valid": False,
+                        "error": [f"Expected list, but got {type(response_json)}"],
+                        "error_type": "executable_checker_rest:wrong_type",
+                    }
+                if len(eval_GT_json) != len(response_json):
+                    return {
+                        "valid": False,
+                        "error": ["Response list length inconsistency."],
+                        "error_type": "value_error:exec_result_rest_count",
+                    }
+                for expected_item, actual_item in zip(eval_GT_json, response_json):
+                    if set(expected_item.keys()) != set(actual_item.keys()):
+                        return {
+                            "valid": False,
+                            "error": ["Key inconsistency"],
+                            "error_type": "executable_checker_rest:wrong_key",
+                        }
+                return {"valid": True, "error": []}
+
+            # Reached when the ground truth itself is neither a dict nor a list.
+            return {
+                "valid": False,
+                "error": [f"Expected dict or list, but got {type(response_json)}"],
+                "error_type": "executable_checker_rest:wrong_type",
+            }
+        except Exception as e:
+            return {
+                "valid": False,
+                "error": [
+                    f"Error in execution and type checking. Status code: {response.status_code}. Error: {str(e)}"
+                ],
+                "error_type": "executable_checker_rest:response_format_error",
+            }
+    except Exception as e:
+        return {
+            "valid": False,
+            "error": [f"Cannot get status code of the response. Error: {str(e)}"],
+            "error_type": "executable_checker_rest:cannot_get_status_code",
+        }
+
+
+def ast_checker(func_description, model_output, possible_answer, language, test_category, model_name):
+    """Dispatch AST-based checking according to ``test_category``.
+
+    Categories containing ``"parallel"`` use the order-insensitive parallel
+    checker, categories containing ``"multiple"`` use the multiple-function
+    checker, and every other category expects exactly one function call that
+    is validated with ``simple_function_checker``.
+    """
+    if "parallel" in test_category:
+        return parallel_function_checker_no_order(func_description, model_output, possible_answer, language, model_name)
+
+    if "multiple" in test_category:
+        return multiple_function_checker(func_description, model_output, possible_answer, language, model_name)
+
+    # Simple category: exactly one call is allowed.
+    if len(model_output) != 1:
+        return {
+            "valid": False,
+            "error": ["Wrong number of functions."],
+            "error_type": "simple_function_checker:wrong_count",
+        }
+
+    only_description = func_description[0]
+    only_output = model_output[0]
+    only_answer = possible_answer[0]
+    return simple_function_checker(only_description, only_output, only_answer, language, model_name)
+
+
+def exec_checker(decoded_result: list, func_description: dict, test_category: str):
+    """Dispatch execution-based checking according to ``test_category``.
+
+    Categories containing ``"multiple"`` or ``"parallel"`` are checked with the
+    order-insensitive parallel checker. Every other category expects exactly
+    one call, which is executed and compared with the first entry of
+    ``execution_result`` / ``execution_result_type`` in ``func_description``.
+    """
+    is_multi_call = any(tag in test_category for tag in ("multiple", "parallel"))
+    if is_multi_call:
+        return executable_checker_parallel_no_order(
+            decoded_result,
+            func_description["execution_result"],
+            func_description["execution_result_type"],
+        )
+
+    if len(decoded_result) != 1:
+        return {
+            "valid": False,
+            "error": ["Wrong number of functions."],
+            "error_type": "simple_exec_checker:wrong_count",
+        }
+
+    expected_value = func_description["execution_result"][0]
+    expected_kind = func_description["execution_result_type"][0]
+    return executable_checker_simple(decoded_result[0], expected_value, expected_kind, False)
+
+
+def is_empty_output(decoded_output):
+    """Return True when the decoded output contains no real function call.
+
+    This is a patch to the AST decoder for relevance detection: the decoder
+    sometimes parses successfully although the input doesn't really contain a
+    function call. ``[]``, ``[{}]`` and anything that is not in function
+    calling format count as empty (and thus should be marked as correct).
+    Returns False otherwise.
+    """
+    if not is_function_calling_format_output(decoded_output):
+        return True
+    if len(decoded_output) == 0:
+        return True
+    if len(decoded_output) == 1 and len(decoded_output[0]) == 0:
+        return True
+    # Explicit boolean: previously the function fell off the end and returned None.
+    return False
+
+
+def is_function_calling_format_output(decoded_output):
+    """Return True only when ``decoded_output`` is a list whose items are all dicts.
+
+    The comparison uses exact types, so subclasses of ``list`` or ``dict`` are
+    not accepted. An empty list is in the expected format.
+    """
+    if type(decoded_output) != list:
+        return False
+    return all(type(item) == dict for item in decoded_output)
--- a/llama_stack/providers/inline/scoring/basic/utils/bfcl/tree_sitter.py
+++ b/llama_stack/providers/inline/scoring/basic/utils/bfcl/tree_sitter.py
@ -0,0 +1,40 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+"""
+Tree-sitter changes its API with unfortunate frequency. Modules that need it should
+import it from here so that we can centrally manage things as necessary.
+"""
+
+# These currently work with tree-sitter 0.23.0
+# NOTE: Don't import tree-sitter or any of the language modules in the main module
+# because not all environments have them. Import lazily inside functions where needed.
+
+import importlib
+import typing
+
+if typing.TYPE_CHECKING:
+    import tree_sitter
+
+
+def get_language(language: str) -> "tree_sitter.Language":
+    """Import the ``tree_sitter_<language>`` module and wrap its grammar in ``tree_sitter.Language``."""
+    import tree_sitter
+
+    try:
+        grammar_module = importlib.import_module(f"tree_sitter_{language}")
+    except ModuleNotFoundError as exc:
+        raise ValueError(
+            f"Language {language} is not found. Please install the tree-sitter-{language} package."
+        ) from exc
+    return tree_sitter.Language(grammar_module.language())
+
+
+def get_parser(language: str, **kwargs) -> "tree_sitter.Parser":
+    """Build a ``tree_sitter.Parser`` for ``language``, forwarding ``kwargs`` to its constructor."""
+    import tree_sitter
+
+    return tree_sitter.Parser(get_language(language), **kwargs)
--- a/llama_stack/providers/inline/scoring/basic/utils/ifeval_utils.py
+++ b/llama_stack/providers/inline/scoring/basic/utils/ifeval_utils.py
--- a/llama_stack/providers/inline/scoring/braintrust/braintrust.py
+++ b/llama_stack/providers/inline/scoring/braintrust/braintrust.py
@ -167,11 +167,11 @@ class BraintrustScoringImpl(
        dataset_def = await self.datasets_api.get_dataset(dataset_id=dataset_id)
        validate_dataset_schema(dataset_def.dataset_schema, get_valid_schemas(Api.scoring.value))

-        all_rows = await self.datasetio_api.get_rows_paginated(
+        all_rows = await self.datasetio_api.iterrows(
            dataset_id=dataset_id,
-            rows_in_page=-1,
+            limit=-1,
        )
-        res = await self.score(input_rows=all_rows.rows, scoring_functions=scoring_functions)
+        res = await self.score(input_rows=all_rows.data, scoring_functions=scoring_functions)
        if save_results_dataset:
            # TODO: persist and register dataset on to server for reading
            # self.datasets_api.register_dataset()
--- a/llama_stack/providers/inline/scoring/llm_as_judge/config.py
+++ b/llama_stack/providers/inline/scoring/llm_as_judge/config.py
@ -3,7 +3,12 @@
 #
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
+from typing import Any, Dict
+
 from pydantic import BaseModel


-class LlmAsJudgeScoringConfig(BaseModel): ...
+class LlmAsJudgeScoringConfig(BaseModel):
+    @classmethod
+    def sample_run_config(cls, __distro_dir__: str, **kwargs: Any) -> Dict[str, Any]:
+        return {}
--- a/llama_stack/providers/inline/scoring/llm_as_judge/scoring.py
+++ b/llama_stack/providers/inline/scoring/llm_as_judge/scoring.py
@ -72,12 +72,12 @@ class LlmAsJudgeScoringImpl(
        dataset_def = await self.datasets_api.get_dataset(dataset_id=dataset_id)
        validate_dataset_schema(dataset_def.dataset_schema, get_valid_schemas(Api.scoring.value))

-        all_rows = await self.datasetio_api.get_rows_paginated(
+        all_rows = await self.datasetio_api.iterrows(
            dataset_id=dataset_id,
-            rows_in_page=-1,
+            limit=-1,
        )
        res = await self.score(
-            input_rows=all_rows.rows,
+            input_rows=all_rows.data,
            scoring_functions=scoring_functions,
        )
        if save_results_dataset:
--- a/llama_stack/providers/inline/telemetry/meta_reference/init.py
+++ b/llama_stack/providers/inline/telemetry/meta_reference/init.py
@ -6,12 +6,14 @@

 from typing import Any, Dict

+from llama_stack.distribution.datatypes import Api
+
 from .config import TelemetryConfig, TelemetrySink

 __all__ = ["TelemetryConfig", "TelemetrySink"]


-async def get_provider_impl(config: TelemetryConfig, deps: Dict[str, Any]):
+async def get_provider_impl(config: TelemetryConfig, deps: Dict[Api, Any]):
    from .telemetry import TelemetryAdapter

    impl = TelemetryAdapter(config, deps)
--- a/llama_stack/providers/inline/telemetry/meta_reference/config.py
+++ b/llama_stack/providers/inline/telemetry/meta_reference/config.py
@ -13,18 +13,24 @@ from llama_stack.distribution.utils.config_dirs import RUNTIME_BASE_DIR


 class TelemetrySink(str, Enum):
-    OTEL = "otel"
+    OTEL_TRACE = "otel_trace"
+    OTEL_METRIC = "otel_metric"
    SQLITE = "sqlite"
    CONSOLE = "console"


 class TelemetryConfig(BaseModel):
-    otel_endpoint: str = Field(
+    otel_trace_endpoint: str = Field(
        default="http://localhost:4318/v1/traces",
-        description="The OpenTelemetry collector endpoint URL",
+        description="The OpenTelemetry collector endpoint URL for traces",
+    )
+    otel_metric_endpoint: str = Field(
+        default="http://localhost:4318/v1/metrics",
+        description="The OpenTelemetry collector endpoint URL for metrics",
    )
    service_name: str = Field(
-        default="llama-stack",
+        # service name is always the same, use zero-width space to avoid clutter
+        default="",
        description="The service name to use for telemetry",
    )
    sinks: List[TelemetrySink] = Field(
@ -46,7 +52,7 @@ class TelemetryConfig(BaseModel):
    @classmethod
    def sample_run_config(cls, __distro_dir__: str, db_name: str = "trace_store.db") -> Dict[str, Any]:
        return {
-            "service_name": "${env.OTEL_SERVICE_NAME:llama-stack}",
+            "service_name": "${env.OTEL_SERVICE_NAME:}",
            "sinks": "${env.TELEMETRY_SINKS:console,sqlite}",
            "sqlite_db_path": "${env.SQLITE_DB_PATH:" + __distro_dir__ + "/" + db_name + "}",
        }
--- a/llama_stack/providers/inline/telemetry/meta_reference/console_span_processor.py
+++ b/llama_stack/providers/inline/telemetry/meta_reference/console_span_processor.py
@ -5,7 +5,7 @@
 # the root directory of this source tree.

 import json
-from datetime import datetime
+from datetime import datetime, timezone

 from opentelemetry.sdk.trace import ReadableSpan
 from opentelemetry.sdk.trace.export import SpanProcessor
@ -34,7 +34,7 @@ class ConsoleSpanProcessor(SpanProcessor):
        if span.attributes and span.attributes.get("__autotraced__"):
            return

-        timestamp = datetime.utcfromtimestamp(span.start_time / 1e9).strftime("%H:%M:%S.%f")[:-3]
+        timestamp = datetime.fromtimestamp(span.start_time / 1e9, tz=timezone.utc).strftime("%H:%M:%S.%f")[:-3]

        print(
            f"{COLORS['dim']}{timestamp}{COLORS['reset']} "
@ -46,7 +46,7 @@ class ConsoleSpanProcessor(SpanProcessor):
        if span.attributes and span.attributes.get("__autotraced__"):
            return

-        timestamp = datetime.utcfromtimestamp(span.end_time / 1e9).strftime("%H:%M:%S.%f")[:-3]
+        timestamp = datetime.fromtimestamp(span.end_time / 1e9, tz=timezone.utc).strftime("%H:%M:%S.%f")[:-3]

        span_context = (
            f"{COLORS['dim']}{timestamp}{COLORS['reset']} "
@ -74,7 +74,7 @@ class ConsoleSpanProcessor(SpanProcessor):
                print(f"    {COLORS['dim']}{key}: {str_value}{COLORS['reset']}")

        for event in span.events:
-            event_time = datetime.utcfromtimestamp(event.timestamp / 1e9).strftime("%H:%M:%S.%f")[:-3]
+            event_time = datetime.fromtimestamp(event.timestamp / 1e9, tz=timezone.utc).strftime("%H:%M:%S.%f")[:-3]

            severity = event.attributes.get("severity", "info")
            message = event.attributes.get("message", event.name)
@ -101,6 +101,6 @@ class ConsoleSpanProcessor(SpanProcessor):
        """Shutdown the processor."""
        pass

-    def force_flush(self, timeout_millis: float = None) -> bool:
+    def force_flush(self, timeout_millis: float | None = None) -> bool:
        """Force flush any pending spans."""
        return True
--- a/llama_stack/providers/inline/telemetry/meta_reference/sqlite_span_processor.py
+++ b/llama_stack/providers/inline/telemetry/meta_reference/sqlite_span_processor.py
@ -8,10 +8,11 @@ import json
 import os
 import sqlite3
 import threading
-from datetime import datetime
+from datetime import datetime, timezone

 from opentelemetry.sdk.trace import SpanProcessor
 from opentelemetry.trace import Span
+from opentelemetry.trace.span import format_span_id, format_trace_id


 class SQLiteSpanProcessor(SpanProcessor):
@ -100,14 +101,14 @@ class SQLiteSpanProcessor(SpanProcessor):
            conn = self._get_connection()
            cursor = conn.cursor()

-            trace_id = format(span.get_span_context().trace_id, "032x")
-            span_id = format(span.get_span_context().span_id, "016x")
+            trace_id = format_trace_id(span.get_span_context().trace_id)
+            span_id = format_span_id(span.get_span_context().span_id)
            service_name = span.resource.attributes.get("service.name", "unknown")

            parent_span_id = None
            parent_context = span.parent
            if parent_context:
-                parent_span_id = format(parent_context.span_id, "016x")
+                parent_span_id = format_span_id(parent_context.span_id)

            # Insert into traces
            cursor.execute(
@ -123,9 +124,9 @@ class SQLiteSpanProcessor(SpanProcessor):
                (
                    trace_id,
                    service_name,
-                    (span_id if not parent_span_id else None),
-                    datetime.fromtimestamp(span.start_time / 1e9).isoformat(),
-                    datetime.fromtimestamp(span.end_time / 1e9).isoformat(),
+                    (span_id if span.attributes.get("__root_span__") == "true" else None),
+                    datetime.fromtimestamp(span.start_time / 1e9, timezone.utc).isoformat(),
+                    datetime.fromtimestamp(span.end_time / 1e9, timezone.utc).isoformat(),
                ),
            )

@ -143,8 +144,8 @@ class SQLiteSpanProcessor(SpanProcessor):
                    trace_id,
                    parent_span_id,
                    span.name,
-                    datetime.fromtimestamp(span.start_time / 1e9).isoformat(),
-                    datetime.fromtimestamp(span.end_time / 1e9).isoformat(),
+                    datetime.fromtimestamp(span.start_time / 1e9, timezone.utc).isoformat(),
+                    datetime.fromtimestamp(span.end_time / 1e9, timezone.utc).isoformat(),
                    json.dumps(dict(span.attributes)),
                    span.status.status_code.name,
                    span.kind.name,
@ -161,7 +162,7 @@ class SQLiteSpanProcessor(SpanProcessor):
                    (
                        span_id,
                        event.name,
-                        datetime.fromtimestamp(event.timestamp / 1e9).isoformat(),
+                        datetime.fromtimestamp(event.timestamp / 1e9, timezone.utc).isoformat(),
                        json.dumps(dict(event.attributes)),
                    ),
                )
--- a/llama_stack/providers/inline/telemetry/meta_reference/telemetry.py
+++ b/llama_stack/providers/inline/telemetry/meta_reference/telemetry.py
@ -44,7 +44,7 @@ from llama_stack.providers.utils.telemetry.sqlite_trace_store import SQLiteTrace

 from .config import TelemetryConfig, TelemetrySink

-_GLOBAL_STORAGE = {
+_GLOBAL_STORAGE: dict[str, dict[str | int, Any]] = {
    "active_spans": {},
    "counters": {},
    "gauges": {},
@ -54,25 +54,16 @@ _global_lock = threading.Lock()
 _TRACER_PROVIDER = None


-def string_to_trace_id(s: str) -> int:
-    # Convert the string to bytes and then to an integer
-    return int.from_bytes(s.encode(), byteorder="big", signed=False)
-
-
-def string_to_span_id(s: str) -> int:
-    # Use only the first 8 bytes (64 bits) for span ID
-    return int.from_bytes(s.encode()[:8], byteorder="big", signed=False)
-
-
 def is_tracing_enabled(tracer):
    with tracer.start_as_current_span("check_tracing") as span:
        return span.is_recording()


 class TelemetryAdapter(TelemetryDatasetMixin, Telemetry):
-    def __init__(self, config: TelemetryConfig, deps: Dict[str, Any]) -> None:
+    def __init__(self, config: TelemetryConfig, deps: Dict[Api, Any]) -> None:
        self.config = config
        self.datasetio_api = deps.get(Api.datasetio)
+        self.meter = None

        resource = Resource.create(
            {
@ -90,15 +81,16 @@ class TelemetryAdapter(TelemetryDatasetMixin, Telemetry):
            provider = TracerProvider(resource=resource)
            trace.set_tracer_provider(provider)
            _TRACER_PROVIDER = provider
-            if TelemetrySink.OTEL in self.config.sinks:
-                otlp_exporter = OTLPSpanExporter(
-                    endpoint=self.config.otel_endpoint,
+            if TelemetrySink.OTEL_TRACE in self.config.sinks:
+                span_exporter = OTLPSpanExporter(
+                    endpoint=self.config.otel_trace_endpoint,
                )
-                span_processor = BatchSpanProcessor(otlp_exporter)
+                span_processor = BatchSpanProcessor(span_exporter)
                trace.get_tracer_provider().add_span_processor(span_processor)
+            if TelemetrySink.OTEL_METRIC in self.config.sinks:
                metric_reader = PeriodicExportingMetricReader(
                    OTLPMetricExporter(
-                        endpoint=self.config.otel_endpoint,
+                        endpoint=self.config.otel_metric_endpoint,
                    )
                )
                metric_provider = MeterProvider(resource=resource, metric_readers=[metric_reader])
@ -108,7 +100,7 @@ class TelemetryAdapter(TelemetryDatasetMixin, Telemetry):
            if TelemetrySink.CONSOLE in self.config.sinks:
                trace.get_tracer_provider().add_span_processor(ConsoleSpanProcessor())

-        if TelemetrySink.OTEL in self.config.sinks:
+        if TelemetrySink.OTEL_METRIC in self.config.sinks:
            self.meter = metrics.get_meter(__name__)
        if TelemetrySink.SQLITE in self.config.sinks:
            self.trace_store = SQLiteTraceStore(self.config.sqlite_db_path)
@ -134,7 +126,7 @@ class TelemetryAdapter(TelemetryDatasetMixin, Telemetry):
    def _log_unstructured(self, event: UnstructuredLogEvent, ttl_seconds: int) -> None:
        with self._lock:
            # Use global storage instead of instance storage
-            span_id = string_to_span_id(event.span_id)
+            span_id = event.span_id
            span = _GLOBAL_STORAGE["active_spans"].get(span_id)

            if span:
@ -145,7 +137,7 @@ class TelemetryAdapter(TelemetryDatasetMixin, Telemetry):
                        "message": event.message,
                        "severity": event.severity.value,
                        "__ttl__": ttl_seconds,
-                        **event.attributes,
+                        **(event.attributes or {}),
                    },
                    timestamp=timestamp_ns,
                )
@ -153,6 +145,7 @@ class TelemetryAdapter(TelemetryDatasetMixin, Telemetry):
                print(f"Warning: No active span found for span_id {span_id}. Dropping event: {event}")

    def _get_or_create_counter(self, name: str, unit: str) -> metrics.Counter:
+        assert self.meter is not None
        if name not in _GLOBAL_STORAGE["counters"]:
            _GLOBAL_STORAGE["counters"][name] = self.meter.create_counter(
                name=name,
@ -162,6 +155,7 @@ class TelemetryAdapter(TelemetryDatasetMixin, Telemetry):
        return _GLOBAL_STORAGE["counters"][name]

    def _get_or_create_gauge(self, name: str, unit: str) -> metrics.ObservableGauge:
+        assert self.meter is not None
        if name not in _GLOBAL_STORAGE["gauges"]:
            _GLOBAL_STORAGE["gauges"][name] = self.meter.create_gauge(
                name=name,
@ -171,6 +165,8 @@ class TelemetryAdapter(TelemetryDatasetMixin, Telemetry):
        return _GLOBAL_STORAGE["gauges"][name]

    def _log_metric(self, event: MetricEvent) -> None:
+        if self.meter is None:
+            return
        if isinstance(event.value, int):
            counter = self._get_or_create_counter(event.metric, event.unit)
            counter.add(event.value, attributes=event.attributes)
@ -179,6 +175,7 @@ class TelemetryAdapter(TelemetryDatasetMixin, Telemetry):
            up_down_counter.add(event.value, attributes=event.attributes)

    def _get_or_create_up_down_counter(self, name: str, unit: str) -> metrics.UpDownCounter:
+        assert self.meter is not None
        if name not in _GLOBAL_STORAGE["up_down_counters"]:
            _GLOBAL_STORAGE["up_down_counters"][name] = self.meter.create_up_down_counter(
                name=name,
@ -189,8 +186,7 @@ class TelemetryAdapter(TelemetryDatasetMixin, Telemetry):

    def _log_structured(self, event: StructuredLogEvent, ttl_seconds: int) -> None:
        with self._lock:
-            span_id = string_to_span_id(event.span_id)
-            trace_id = string_to_trace_id(event.trace_id)
+            span_id = int(event.span_id, 16)
            tracer = trace.get_tracer(__name__)
            if event.attributes is None:
                event.attributes = {}
@ -201,14 +197,13 @@ class TelemetryAdapter(TelemetryDatasetMixin, Telemetry):
                if span_id in _GLOBAL_STORAGE["active_spans"]:
                    return

-                parent_span = None
+                context = None
                if event.payload.parent_span_id:
-                    parent_span_id = string_to_span_id(event.payload.parent_span_id)
+                    parent_span_id = int(event.payload.parent_span_id, 16)
                    parent_span = _GLOBAL_STORAGE["active_spans"].get(parent_span_id)
-
-                context = trace.Context(trace_id=trace_id)
-                if parent_span:
-                    context = trace.set_span_in_context(parent_span, context)
+                    context = trace.set_span_in_context(parent_span)
+                else:
+                    event.attributes["__root_span__"] = "true"

                span = tracer.start_span(
                    name=event.payload.name,
--- a/llama_stack/providers/inline/telemetry/sample/init.py
+++ b/llama_stack/providers/inline/telemetry/sample/init.py
@ -1,17 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from typing import Any
-
-from .config import SampleConfig
-
-
-async def get_adapter_impl(config: SampleConfig, _deps) -> Any:
-    from .sample import SampleTelemetryImpl
-
-    impl = SampleTelemetryImpl(config)
-    await impl.initialize()
-    return impl
--- a/llama_stack/providers/inline/telemetry/sample/sample.py
+++ b/llama_stack/providers/inline/telemetry/sample/sample.py
@ -1,17 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from llama_stack.apis.telemetry import Telemetry
-
-from .config import SampleConfig
-
-
-class SampleTelemetryImpl(Telemetry):
-    def __init__(self, config: SampleConfig):
-        self.config = config
-
-    async def initialize(self):
-        pass
--- a/llama_stack/providers/inline/tool_runtime/code_interpreter/code_env_prefix.py
+++ b/llama_stack/providers/inline/tool_runtime/code_interpreter/code_env_prefix.py
@ -69,7 +69,7 @@ def popen_not_allowed(*args, **kwargs):
    )


-_subprocess.Popen = popen_not_allowed
+_subprocess.Popen = popen_not_allowed  # type: ignore


 import atexit as _atexit
@ -104,7 +104,7 @@ def _open_connections():
    return _NETWORK_CONNECTIONS


-_builtins._open_connections = _open_connections
+_builtins._open_connections = _open_connections  # type: ignore


@_atexit.register
--- a/llama_stack/providers/inline/tool_runtime/code_interpreter/code_execution.py
+++ b/llama_stack/providers/inline/tool_runtime/code_interpreter/code_execution.py
@ -76,6 +76,7 @@ class CodeExecutionRequest:
    only_last_cell_fail: bool = True
    seed: int = 0
    strip_fpaths_in_stderr: bool = True
+    use_bwrap: bool = True


 class CodeExecutor:
@ -103,8 +104,6 @@ _set_seeds()\

        script = "\n\n".join([seeds_prefix] + [CODE_ENV_PREFIX] + scripts)
        with tempfile.TemporaryDirectory() as dpath:
-            bwrap_prefix = "bwrap " + generate_bwrap_command(bind_dirs=[dpath])
-            cmd = [*bwrap_prefix.split(), sys.executable, "-c", script]
            code_fpath = os.path.join(dpath, "code.py")
            with open(code_fpath, "w") as f:
                f.write(script)
@ -118,6 +117,13 @@ _set_seeds()\
                    MPLBACKEND="module://matplotlib_custom_backend",
                    PYTHONPATH=f"{DIRNAME}:{python_path}",
                )
+
+                if req.use_bwrap:
+                    bwrap_prefix = "bwrap " + generate_bwrap_command(bind_dirs=[dpath])
+                    cmd = [*bwrap_prefix.split(), sys.executable, "-c", script]
+                else:
+                    cmd = [sys.executable, "-c", script]
+
                stdout, stderr, returncode = do_subprocess(
                    cmd=cmd,
                    env=env,
@ -155,14 +161,14 @@ _set_seeds()\
 def process_matplotlib_response(response, matplotlib_dump_dir: str):
    image_data = response["image_data"]
    # Convert the base64 string to a bytes object
-    images = [base64.b64decode(d["image_base64"]) for d in image_data]
+    images_raw = [base64.b64decode(d["image_base64"]) for d in image_data]
    # Create a list of PIL images from the bytes objects
-    images = [Image.open(BytesIO(img)) for img in images]
+    images = [Image.open(BytesIO(img)) for img in images_raw]
    # Create a list of image paths
    image_paths = []
    for i, img in enumerate(images):
        # create new directory for each day to better organize data:
-        dump_dname = datetime.today().strftime("%Y-%m-%d")
+        dump_dname = datetime.today().strftime("%Y-%m-%d")  # noqa: DTZ002 - we don't care about timezones here since we are displaying the date
        dump_dpath = Path(matplotlib_dump_dir, dump_dname)
        dump_dpath.mkdir(parents=True, exist_ok=True)
        # save image into a file
--- a/llama_stack/providers/inline/tool_runtime/code_interpreter/code_interpreter.py
+++ b/llama_stack/providers/inline/tool_runtime/code_interpreter/code_interpreter.py
@ -5,12 +5,15 @@
 # the root directory of this source tree.


+import asyncio
 import logging
+import os
 import tempfile
-from typing import Any, Dict, List, Optional
+from typing import Any, Dict, Optional

 from llama_stack.apis.common.content_types import URL
 from llama_stack.apis.tools import (
+    ListToolDefsResponse,
    Tool,
    ToolDef,
    ToolInvocationResult,
@ -36,7 +39,7 @@ class CodeInterpreterToolRuntimeImpl(ToolsProtocolPrivate, ToolRuntime):
    async def initialize(self):
        pass

-    async def register_tool(self, tool: Tool):
+    async def register_tool(self, tool: Tool) -> None:
        pass

    async def unregister_tool(self, tool_id: str) -> None:
@ -44,25 +47,29 @@ class CodeInterpreterToolRuntimeImpl(ToolsProtocolPrivate, ToolRuntime):

    async def list_runtime_tools(
        self, tool_group_id: Optional[str] = None, mcp_endpoint: Optional[URL] = None
-    ) -> List[ToolDef]:
-        return [
-            ToolDef(
-                name="code_interpreter",
-                description="Execute code",
-                parameters=[
-                    ToolParameter(
-                        name="code",
-                        description="The code to execute",
-                        parameter_type="string",
-                    ),
-                ],
-            )
-        ]
+    ) -> ListToolDefsResponse:
+        return ListToolDefsResponse(
+            data=[
+                ToolDef(
+                    name="code_interpreter",
+                    description="Execute code",
+                    parameters=[
+                        ToolParameter(
+                            name="code",
+                            description="The code to execute",
+                            parameter_type="string",
+                        ),
+                    ],
+                )
+            ]
+        )

    async def invoke_tool(self, tool_name: str, kwargs: Dict[str, Any]) -> ToolInvocationResult:
        script = kwargs["code"]
-        req = CodeExecutionRequest(scripts=[script])
-        res = self.code_executor.execute(req)
+        # Use environment variable to control bwrap usage
+        force_disable_bwrap = os.environ.get("DISABLE_CODE_SANDBOX", "").lower() in ("1", "true", "yes")
+        req = CodeExecutionRequest(scripts=[script], use_bwrap=not force_disable_bwrap)
+        res = await asyncio.to_thread(self.code_executor.execute, req)
        pieces = [res["process_status"]]
        for out_type in ["stdout", "stderr"]:
            res_out = res[out_type]
--- a/llama_stack/providers/inline/tool_runtime/code_interpreter/config.py
+++ b/llama_stack/providers/inline/tool_runtime/code_interpreter/config.py
@ -4,8 +4,12 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.

+from typing import Any, Dict
+
 from pydantic import BaseModel


 class CodeInterpreterToolConfig(BaseModel):
-    pass
+    @classmethod
+    def sample_run_config(cls, __distro_dir__: str, **kwargs: Any) -> Dict[str, Any]:
+        return {}
--- a/llama_stack/providers/inline/tool_runtime/rag/init.py
+++ b/llama_stack/providers/inline/tool_runtime/rag/init.py
@ -11,7 +11,7 @@ from llama_stack.providers.datatypes import Api
 from .config import RagToolRuntimeConfig


-async def get_provider_impl(config: RagToolRuntimeConfig, deps: Dict[str, Any]):
+async def get_provider_impl(config: RagToolRuntimeConfig, deps: Dict[Api, Any]):
    from .memory import MemoryToolRuntimeImpl

    impl = MemoryToolRuntimeImpl(config, deps[Api.vector_io], deps[Api.inference], deps[Api.preprocessing])
--- a/llama_stack/providers/inline/tool_runtime/rag/config.py
+++ b/llama_stack/providers/inline/tool_runtime/rag/config.py
@ -4,8 +4,12 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.

+from typing import Any, Dict
+
 from pydantic import BaseModel


 class RagToolRuntimeConfig(BaseModel):
-    pass
+    @classmethod
+    def sample_run_config(cls, __distro_dir__: str, **kwargs: Any) -> Dict[str, Any]:
+        return {}
--- a/llama_stack/providers/inline/tool_runtime/rag/memory.py
+++ b/llama_stack/providers/inline/tool_runtime/rag/memory.py
@ -15,6 +15,7 @@ from pydantic import TypeAdapter
 from llama_stack.apis.common.content_types import (
    URL,
    InterleavedContent,
+    InterleavedContentItem,
    TextContentItem,
 )
 from llama_stack.apis.inference import Inference
@ -27,10 +28,12 @@ from llama_stack.apis.preprocessing import (
    PreprocessorChainElement,
 )
 from llama_stack.apis.tools import (
+    ListToolDefsResponse,
    RAGDocument,
    RAGQueryConfig,
    RAGQueryResult,
    RAGToolRuntime,
+    Tool,
    ToolDef,
    ToolInvocationResult,
    ToolParameter,
@ -73,6 +76,12 @@ class MemoryToolRuntimeImpl(ToolsProtocolPrivate, ToolRuntime, RAGToolRuntime):
    async def shutdown(self):
        pass

+    async def register_tool(self, tool: Tool) -> None:
+        pass
+
+    async def unregister_tool(self, tool_id: str) -> None:
+        return
+
    async def insert(
        self,
        documents: List[RAGDocument],
@ -103,7 +112,7 @@ class MemoryToolRuntimeImpl(ToolsProtocolPrivate, ToolRuntime, RAGToolRuntime):

        actual_chunks = [chunk.data_element_path_or_content for chunk in chunks]
        await self.vector_io_api.insert_chunks(
-            chunks=actual_chunks,
+            chunks=actual_chunks,  # type: ignore
            vector_db_id=vector_db_id,
        )

@ -140,11 +149,11 @@ class MemoryToolRuntimeImpl(ToolsProtocolPrivate, ToolRuntime, RAGToolRuntime):
            return RAGQueryResult(content=None)

        # sort by score
-        chunks, scores = zip(*sorted(zip(chunks, scores, strict=False), key=lambda x: x[1], reverse=True), strict=False)
+        chunks, scores = zip(*sorted(zip(chunks, scores, strict=False), key=lambda x: x[1], reverse=True), strict=False)  # type: ignore
        chunks = chunks[: query_config.max_chunks]

        tokens = 0
-        picked = [
+        picked: list[InterleavedContentItem] = [
            TextContentItem(
                text=f"knowledge_search tool found {len(chunks)} chunks:\nBEGIN of knowledge_search tool results.\n"
            )
@ -173,27 +182,29 @@ class MemoryToolRuntimeImpl(ToolsProtocolPrivate, ToolRuntime, RAGToolRuntime):

    async def list_runtime_tools(
        self, tool_group_id: Optional[str] = None, mcp_endpoint: Optional[URL] = None
-    ) -> List[ToolDef]:
+    ) -> ListToolDefsResponse:
        # Parameters are not listed since these methods are not yet invoked automatically
        # by the LLM. The method is only implemented so things like /tools can list without
        # encountering fatals.
-        return [
-            ToolDef(
-                name="insert_into_memory",
-                description="Insert documents into memory",
-            ),
-            ToolDef(
-                name="knowledge_search",
-                description="Search for information in a database.",
-                parameters=[
-                    ToolParameter(
-                        name="query",
-                        description="The query to search for. Can be a natural language sentence or keywords.",
-                        parameter_type="string",
-                    ),
-                ],
-            ),
-        ]
+        return ListToolDefsResponse(
+            data=[
+                ToolDef(
+                    name="insert_into_memory",
+                    description="Insert documents into memory",
+                ),
+                ToolDef(
+                    name="knowledge_search",
+                    description="Search for information in a database.",
+                    parameters=[
+                        ToolParameter(
+                            name="query",
+                            description="The query to search for. Can be a natural language sentence or keywords.",
+                            parameter_type="string",
+                        ),
+                    ],
+                ),
+            ]
+        )

    async def invoke_tool(self, tool_name: str, kwargs: Dict[str, Any]) -> ToolInvocationResult:
        vector_db_ids = kwargs.get("vector_db_ids", [])
--- a/llama_stack/providers/inline/vector_io/chroma/config.py
+++ b/llama_stack/providers/inline/vector_io/chroma/config.py
@ -13,5 +13,5 @@ class ChromaVectorIOConfig(BaseModel):
    db_path: str

    @classmethod
-    def sample_config(cls) -> Dict[str, Any]:
-        return {"db_path": "{env.CHROMADB_PATH}"}
+    def sample_run_config(cls, db_path: str = "${env.CHROMADB_PATH}", **kwargs: Any) -> Dict[str, Any]:
+        return {"db_path": db_path}
--- a/llama_stack/providers/inline/vector_io/faiss/faiss.py
+++ b/llama_stack/providers/inline/vector_io/faiss/faiss.py
@ -15,11 +15,13 @@ import faiss
 import numpy as np
 from numpy.typing import NDArray

-from llama_stack.apis.inference import InterleavedContent
+from llama_stack.apis.common.content_types import InterleavedContent
+from llama_stack.apis.inference.inference import Inference
 from llama_stack.apis.vector_dbs import VectorDB
 from llama_stack.apis.vector_io import Chunk, QueryChunksResponse, VectorIO
-from llama_stack.providers.datatypes import Api, VectorDBsProtocolPrivate
+from llama_stack.providers.datatypes import VectorDBsProtocolPrivate
 from llama_stack.providers.utils.kvstore import kvstore_impl
+from llama_stack.providers.utils.kvstore.api import KVStore
 from llama_stack.providers.utils.memory.vector_store import (
    EmbeddingIndex,
    VectorDBWithIndex,
@ -35,16 +37,14 @@ FAISS_INDEX_PREFIX = f"faiss_index:{VERSION}::"


 class FaissIndex(EmbeddingIndex):
-    chunk_by_index: Dict[int, str]
-
-    def __init__(self, dimension: int, kvstore=None, bank_id: str = None):
+    def __init__(self, dimension: int, kvstore: KVStore | None = None, bank_id: str | None = None):
        self.index = faiss.IndexFlatL2(dimension)
-        self.chunk_by_index = {}
+        self.chunk_by_index: dict[int, Chunk] = {}
        self.kvstore = kvstore
        self.bank_id = bank_id

    @classmethod
-    async def create(cls, dimension: int, kvstore=None, bank_id: str = None):
+    async def create(cls, dimension: int, kvstore: KVStore | None = None, bank_id: str | None = None):
        instance = cls(dimension, kvstore, bank_id)
        await instance.initialize()
        return instance
@ -114,11 +114,11 @@ class FaissIndex(EmbeddingIndex):


 class FaissVectorIOAdapter(VectorIO, VectorDBsProtocolPrivate):
-    def __init__(self, config: FaissVectorIOConfig, inference_api: Api.inference) -> None:
+    def __init__(self, config: FaissVectorIOConfig, inference_api: Inference) -> None:
        self.config = config
        self.inference_api = inference_api
-        self.cache = {}
-        self.kvstore = None
+        self.cache: dict[str, VectorDBWithIndex] = {}
+        self.kvstore: KVStore | None = None

    async def initialize(self) -> None:
        self.kvstore = await kvstore_impl(self.config.kvstore)
@ -144,6 +144,8 @@ class FaissVectorIOAdapter(VectorIO, VectorDBsProtocolPrivate):
        self,
        vector_db: VectorDB,
    ) -> None:
+        assert self.kvstore is not None
+
        key = f"{VECTOR_DBS_PREFIX}{vector_db.identifier}"
        await self.kvstore.set(
            key=key,
@ -161,6 +163,8 @@ class FaissVectorIOAdapter(VectorIO, VectorDBsProtocolPrivate):
        return [i.vector_db for i in self.cache.values()]

    async def unregister_vector_db(self, vector_db_id: str) -> None:
+        assert self.kvstore is not None
+
        if vector_db_id not in self.cache:
            logger.warning(f"Vector DB {vector_db_id} not found")
            return
--- a/llama_stack/providers/inline/vector_io/qdrant/__init__.py
+++ b/llama_stack/providers/inline/vector_io/qdrant/__init__.py
@ -0,0 +1,19 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from typing import Dict
+
+from llama_stack.providers.datatypes import Api, ProviderSpec
+
+from .config import QdrantVectorIOConfig
+
+
+async def get_adapter_impl(config: QdrantVectorIOConfig, deps: Dict[Api, ProviderSpec]):
+    from llama_stack.providers.remote.vector_io.qdrant.qdrant import QdrantVectorIOAdapter
+
+    impl = QdrantVectorIOAdapter(config, deps[Api.inference])
+    await impl.initialize()
+    return impl
--- a/llama_stack/providers/inline/vector_io/qdrant/config.py
+++ b/llama_stack/providers/inline/vector_io/qdrant/config.py
@ -0,0 +1,23 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+
+from typing import Any, Dict
+
+from pydantic import BaseModel
+
+from llama_stack.schema_utils import json_schema_type
+
+
+@json_schema_type
+class QdrantVectorIOConfig(BaseModel):
+    path: str
+
+    @classmethod
+    def sample_run_config(cls, __distro_dir__: str) -> Dict[str, Any]:
+        return {
+            "path": "${env.QDRANT_PATH:~/.llama/" + __distro_dir__ + "}/" + "qdrant.db",
+        }
--- a/llama_stack/providers/inline/vector_io/sqlite_vec/sqlite_vec.py
+++ b/llama_stack/providers/inline/vector_io/sqlite_vec/sqlite_vec.py
@ -4,6 +4,7 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.

+import asyncio
 import hashlib
 import logging
 import sqlite3
@ -15,9 +16,10 @@ import numpy as np
 import sqlite_vec
 from numpy.typing import NDArray

+from llama_stack.apis.inference.inference import Inference
 from llama_stack.apis.vector_dbs import VectorDB
 from llama_stack.apis.vector_io import Chunk, QueryChunksResponse, VectorIO
-from llama_stack.providers.datatypes import Api, VectorDBsProtocolPrivate
+from llama_stack.providers.datatypes import VectorDBsProtocolPrivate
 from llama_stack.providers.utils.memory.vector_store import EmbeddingIndex, VectorDBWithIndex

 logger = logging.getLogger(__name__)
@ -28,6 +30,15 @@ def serialize_vector(vector: List[float]) -> bytes:
    return struct.pack(f"{len(vector)}f", *vector)


+def _create_sqlite_connection(db_path):
+    """Create a SQLite connection with sqlite_vec extension loaded."""
+    connection = sqlite3.connect(db_path)
+    connection.enable_load_extension(True)
+    sqlite_vec.load(connection)
+    connection.enable_load_extension(False)
+    return connection
+
+
 class SQLiteVecIndex(EmbeddingIndex):
    """
    An index implementation that stores embeddings in a SQLite virtual table using sqlite-vec.
@ -36,40 +47,56 @@ class SQLiteVecIndex(EmbeddingIndex):
      - A virtual table (vec_chunks_{bank_id}) that holds the serialized vector.
    """

-    def __init__(self, dimension: int, connection: sqlite3.Connection, bank_id: str):
+    def __init__(self, dimension: int, db_path: str, bank_id: str):
        self.dimension = dimension
-        self.connection = connection
+        self.db_path = db_path
        self.bank_id = bank_id
        self.metadata_table = f"chunks_{bank_id}".replace("-", "_")
        self.vector_table = f"vec_chunks_{bank_id}".replace("-", "_")

    @classmethod
-    async def create(cls, dimension: int, connection: sqlite3.Connection, bank_id: str):
-        instance = cls(dimension, connection, bank_id)
+    async def create(cls, dimension: int, db_path: str, bank_id: str):
+        instance = cls(dimension, db_path, bank_id)
        await instance.initialize()
        return instance

    async def initialize(self) -> None:
-        cur = self.connection.cursor()
-        # Create the table to store chunk metadata.
-        cur.execute(f"""
-            CREATE TABLE IF NOT EXISTS {self.metadata_table} (
-                id TEXT PRIMARY KEY,
-                chunk TEXT
-            );
-        """)
-        # Create the virtual table for embeddings.
-        cur.execute(f"""
-            CREATE VIRTUAL TABLE IF NOT EXISTS {self.vector_table}
-            USING vec0(embedding FLOAT[{self.dimension}], id TEXT);
-        """)
-        self.connection.commit()
+        def _init_tables():
+            connection = _create_sqlite_connection(self.db_path)
+            cur = connection.cursor()
+            try:
+                # Create the table to store chunk metadata.
+                cur.execute(f"""
+                    CREATE TABLE IF NOT EXISTS {self.metadata_table} (
+                        id TEXT PRIMARY KEY,
+                        chunk TEXT
+                    );
+                """)
+                # Create the virtual table for embeddings.
+                cur.execute(f"""
+                    CREATE VIRTUAL TABLE IF NOT EXISTS {self.vector_table}
+                    USING vec0(embedding FLOAT[{self.dimension}], id TEXT);
+                """)
+                connection.commit()
+            finally:
+                cur.close()
+                connection.close()

-    async def delete(self):
-        cur = self.connection.cursor()
-        cur.execute(f"DROP TABLE IF EXISTS {self.metadata_table};")
-        cur.execute(f"DROP TABLE IF EXISTS {self.vector_table};")
-        self.connection.commit()
+        await asyncio.to_thread(_init_tables)
+
+    async def delete(self) -> None:
+        def _drop_tables():
+            connection = _create_sqlite_connection(self.db_path)
+            cur = connection.cursor()
+            try:
+                cur.execute(f"DROP TABLE IF EXISTS {self.metadata_table};")
+                cur.execute(f"DROP TABLE IF EXISTS {self.vector_table};")
+                connection.commit()
+            finally:
+                cur.close()
+                connection.close()
+
+        await asyncio.to_thread(_drop_tables)

    async def add_chunks(self, chunks: List[Chunk], embeddings: NDArray, batch_size: int = 500):
        """
@ -78,42 +105,57 @@ class SQLiteVecIndex(EmbeddingIndex):
        embedding (serialized to raw bytes) into the virtual table using the assigned rowid.
        If any insert fails, the transaction is rolled back to maintain consistency.
        """
-        cur = self.connection.cursor()
-        try:
-            # Start transaction
-            cur.execute("BEGIN TRANSACTION")
-            for i in range(0, len(chunks), batch_size):
-                batch_chunks = chunks[i : i + batch_size]
-                batch_embeddings = embeddings[i : i + batch_size]
-                # Prepare metadata inserts
-                metadata_data = [
-                    (generate_chunk_id(chunk.metadata["document_id"], chunk.content), chunk.model_dump_json())
-                    for chunk in batch_chunks
-                ]
-                # Insert metadata (ON CONFLICT to avoid duplicates)
-                cur.executemany(
-                    f"""
-                    INSERT INTO {self.metadata_table} (id, chunk)
-                    VALUES (?, ?)
-                    ON CONFLICT(id) DO UPDATE SET chunk = excluded.chunk;
-                    """,
-                    metadata_data,
-                )
-                # Prepare embeddings inserts
-                embedding_data = [
-                    (generate_chunk_id(chunk.metadata["document_id"], chunk.content), serialize_vector(emb.tolist()))
-                    for chunk, emb in zip(batch_chunks, batch_embeddings, strict=True)
-                ]
-                # Insert embeddings in batch
-                cur.executemany(f"INSERT INTO {self.vector_table} (id, embedding) VALUES (?, ?);", embedding_data)
-            self.connection.commit()
+        assert all(isinstance(chunk.content, str) for chunk in chunks), "SQLiteVecIndex only supports text chunks"

-        except sqlite3.Error as e:
-            self.connection.rollback()  # Rollback on failure
-            logger.error(f"Error inserting into {self.vector_table}: {e}")
+        def _execute_all_batch_inserts():
+            connection = _create_sqlite_connection(self.db_path)
+            cur = connection.cursor()

-        finally:
-            cur.close()  # Ensure cursor is closed
+            try:
+                # Start a single transaction for all batches
+                cur.execute("BEGIN TRANSACTION")
+                for i in range(0, len(chunks), batch_size):
+                    batch_chunks = chunks[i : i + batch_size]
+                    batch_embeddings = embeddings[i : i + batch_size]
+                    # Prepare metadata inserts
+                    metadata_data = [
+                        (generate_chunk_id(chunk.metadata["document_id"], chunk.content), chunk.model_dump_json())
+                        for chunk in batch_chunks
+                        if isinstance(chunk.content, str)
+                    ]
+                    # Insert metadata (ON CONFLICT to avoid duplicates)
+                    cur.executemany(
+                        f"""
+                        INSERT INTO {self.metadata_table} (id, chunk)
+                        VALUES (?, ?)
+                        ON CONFLICT(id) DO UPDATE SET chunk = excluded.chunk;
+                        """,
+                        metadata_data,
+                    )
+                    # Prepare embeddings inserts
+                    embedding_data = [
+                        (
+                            generate_chunk_id(chunk.metadata["document_id"], chunk.content),
+                            serialize_vector(emb.tolist()),
+                        )
+                        for chunk, emb in zip(batch_chunks, batch_embeddings, strict=True)
+                        if isinstance(chunk.content, str)
+                    ]
+                    # Insert embeddings in batch
+                    cur.executemany(f"INSERT INTO {self.vector_table} (id, embedding) VALUES (?, ?);", embedding_data)
+                connection.commit()
+
+            except sqlite3.Error as e:
+                connection.rollback()  # Rollback on failure
+                logger.error(f"Error inserting into {self.vector_table}: {e}")
+                raise
+
+            finally:
+                cur.close()
+                connection.close()
+
+        # Process all batches in a single thread
+        await asyncio.to_thread(_execute_all_batch_inserts)

    async def query(self, embedding: NDArray, k: int, score_threshold: float) -> QueryChunksResponse:
        """
@ -122,18 +164,28 @@ class SQLiteVecIndex(EmbeddingIndex):
        """
        emb_list = embedding.tolist() if isinstance(embedding, np.ndarray) else list(embedding)
        emb_blob = serialize_vector(emb_list)
-        cur = self.connection.cursor()
-        query_sql = f"""
-            SELECT m.id, m.chunk, v.distance
-            FROM {self.vector_table} AS v
-            JOIN {self.metadata_table} AS m ON m.id = v.id
-            WHERE v.embedding MATCH ? AND k = ?
-            ORDER BY v.distance;
-        """
-        cur.execute(query_sql, (emb_blob, k))
-        rows = cur.fetchall()
-        chunks = []
-        scores = []
+
+        def _execute_query():
+            connection = _create_sqlite_connection(self.db_path)
+            cur = connection.cursor()
+
+            try:
+                query_sql = f"""
+                    SELECT m.id, m.chunk, v.distance
+                    FROM {self.vector_table} AS v
+                    JOIN {self.metadata_table} AS m ON m.id = v.id
+                    WHERE v.embedding MATCH ? AND k = ?
+                    ORDER BY v.distance;
+                """
+                cur.execute(query_sql, (emb_blob, k))
+                return cur.fetchall()
+            finally:
+                cur.close()
+                connection.close()
+
+        rows = await asyncio.to_thread(_execute_query)
+
+        chunks, scores = [], []
        for _id, chunk_json, distance in rows:
            try:
                chunk = Chunk.model_validate_json(chunk_json)
@ -154,67 +206,85 @@ class SQLiteVecVectorIOAdapter(VectorIO, VectorDBsProtocolPrivate):
    and creates a cache of VectorDBWithIndex instances (each wrapping a SQLiteVecIndex).
    """

-    def __init__(self, config, inference_api: Api.inference) -> None:
+    def __init__(self, config, inference_api: Inference) -> None:
        self.config = config
        self.inference_api = inference_api
        self.cache: Dict[str, VectorDBWithIndex] = {}
-        self.connection: Optional[sqlite3.Connection] = None

    async def initialize(self) -> None:
-        # Open a connection to the SQLite database (the file is specified in the config).
-        self.connection = sqlite3.connect(self.config.db_path)
-        self.connection.enable_load_extension(True)
-        sqlite_vec.load(self.connection)
-        self.connection.enable_load_extension(False)
-        cur = self.connection.cursor()
-        # Create a table to persist vector DB registrations.
-        cur.execute("""
-            CREATE TABLE IF NOT EXISTS vector_dbs (
-                id TEXT PRIMARY KEY,
-                metadata TEXT
-            );
-        """)
-        self.connection.commit()
-        # Load any existing vector DB registrations.
-        cur.execute("SELECT metadata FROM vector_dbs")
-        rows = cur.fetchall()
+        def _setup_connection():
+            # Open a connection to the SQLite database (the file is specified in the config).
+            connection = _create_sqlite_connection(self.config.db_path)
+            cur = connection.cursor()
+            try:
+                # Create a table to persist vector DB registrations.
+                cur.execute("""
+                    CREATE TABLE IF NOT EXISTS vector_dbs (
+                        id TEXT PRIMARY KEY,
+                        metadata TEXT
+                    );
+                """)
+                connection.commit()
+                # Load any existing vector DB registrations.
+                cur.execute("SELECT metadata FROM vector_dbs")
+                rows = cur.fetchall()
+                return rows
+            finally:
+                cur.close()
+                connection.close()
+
+        rows = await asyncio.to_thread(_setup_connection)
        for row in rows:
            vector_db_data = row[0]
            vector_db = VectorDB.model_validate_json(vector_db_data)
-            index = await SQLiteVecIndex.create(vector_db.embedding_dimension, self.connection, vector_db.identifier)
+            index = await SQLiteVecIndex.create(
+                vector_db.embedding_dimension, self.config.db_path, vector_db.identifier
+            )
            self.cache[vector_db.identifier] = VectorDBWithIndex(vector_db, index, self.inference_api)

    async def shutdown(self) -> None:
-        if self.connection:
-            self.connection.close()
-            self.connection = None
+        # nothing to do since we don't maintain a persistent connection
+        pass

    async def register_vector_db(self, vector_db: VectorDB) -> None:
-        if self.connection is None:
-            raise RuntimeError("SQLite connection not initialized")
-        cur = self.connection.cursor()
-        cur.execute(
-            "INSERT OR REPLACE INTO vector_dbs (id, metadata) VALUES (?, ?)",
-            (vector_db.identifier, vector_db.model_dump_json()),
-        )
-        self.connection.commit()
-        index = await SQLiteVecIndex.create(vector_db.embedding_dimension, self.connection, vector_db.identifier)
+        def _register_db():
+            connection = _create_sqlite_connection(self.config.db_path)
+            cur = connection.cursor()
+            try:
+                cur.execute(
+                    "INSERT OR REPLACE INTO vector_dbs (id, metadata) VALUES (?, ?)",
+                    (vector_db.identifier, vector_db.model_dump_json()),
+                )
+                connection.commit()
+            finally:
+                cur.close()
+                connection.close()
+
+        await asyncio.to_thread(_register_db)
+        index = await SQLiteVecIndex.create(vector_db.embedding_dimension, self.config.db_path, vector_db.identifier)
        self.cache[vector_db.identifier] = VectorDBWithIndex(vector_db, index, self.inference_api)

    async def list_vector_dbs(self) -> List[VectorDB]:
        return [v.vector_db for v in self.cache.values()]

    async def unregister_vector_db(self, vector_db_id: str) -> None:
-        if self.connection is None:
-            raise RuntimeError("SQLite connection not initialized")
        if vector_db_id not in self.cache:
            logger.warning(f"Vector DB {vector_db_id} not found")
            return
        await self.cache[vector_db_id].index.delete()
        del self.cache[vector_db_id]
-        cur = self.connection.cursor()
-        cur.execute("DELETE FROM vector_dbs WHERE id = ?", (vector_db_id,))
-        self.connection.commit()
+
+        def _delete_vector_db_from_registry():
+            connection = _create_sqlite_connection(self.config.db_path)
+            cur = connection.cursor()
+            try:
+                cur.execute("DELETE FROM vector_dbs WHERE id = ?", (vector_db_id,))
+                connection.commit()
+            finally:
+                cur.close()
+                connection.close()
+
+        await asyncio.to_thread(_delete_vector_db_from_registry)

    async def insert_chunks(self, vector_db_id: str, chunks: List[Chunk], ttl_seconds: Optional[int] = None) -> None:
        if vector_db_id not in self.cache: