pre-commit fixes

2026-01-02 04:20:02 +00:00 · 2025-03-14 13:56:05 -07:00 · 2025-03-14 13:56:05 -07:00 · 7e211f8553
commit 7e211f8553
parent 967dd0aa08
314 changed files with 5574 additions and 11369 deletions
--- a/llama_stack/providers/inline/agents/meta_reference/init.py
+++ b/llama_stack/providers/inline/agents/meta_reference/init.py
@ -4,14 +4,14 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.

-from typing import Dict
+from typing import Any, Dict

-from llama_stack.distribution.datatypes import Api, ProviderSpec
+from llama_stack.distribution.datatypes import Api

 from .config import MetaReferenceAgentsImplConfig


-async def get_provider_impl(config: MetaReferenceAgentsImplConfig, deps: Dict[Api, ProviderSpec]):
+async def get_provider_impl(config: MetaReferenceAgentsImplConfig, deps: Dict[Api, Any]):
    from .agents import MetaReferenceAgentsImpl

    impl = MetaReferenceAgentsImpl(
--- a/llama_stack/providers/inline/agents/meta_reference/agent_instance.py
+++ b/llama_stack/providers/inline/agents/meta_reference/agent_instance.py
@ -12,12 +12,11 @@ import secrets
 import string
 import uuid
 from datetime import datetime
-from typing import Any, AsyncGenerator, Dict, List, Optional, Tuple
+from typing import AsyncGenerator, List, Optional, Union
 from urllib.parse import urlparse

 import httpx

-from llama_stack import logcat
 from llama_stack.apis.agents import (
    AgentConfig,
    AgentToolGroup,
@ -31,7 +30,6 @@ from llama_stack.apis.agents import (
    AgentTurnResponseStreamChunk,
    AgentTurnResponseTurnAwaitingInputPayload,
    AgentTurnResponseTurnCompletePayload,
-    AgentTurnResponseTurnStartPayload,
    AgentTurnResumeRequest,
    Attachment,
    Document,
@ -68,6 +66,7 @@ from llama_stack.apis.tools import (
    ToolRuntime,
 )
 from llama_stack.apis.vector_io import VectorIO
+from llama_stack.log import get_logger
 from llama_stack.models.llama.datatypes import (
    BuiltinTool,
    ToolCall,
@ -89,6 +88,8 @@ MEMORY_QUERY_TOOL = "knowledge_search"
 WEB_SEARCH_TOOL = "web_search"
 RAG_TOOL_GROUP = "builtin::rag"

+logger = get_logger(name=__name__, category="agents")
+

 class ChatAgent(ShieldRunnerMixin):
    def __init__(
@ -152,7 +153,6 @@ class ChatAgent(ShieldRunnerMixin):
                    messages.append(
                        ToolResponseMessage(
                            call_id=response.call_id,
-                            tool_name=response.tool_name,
                            content=response.content,
                        )
                    )
@ -180,120 +180,58 @@ class ChatAgent(ShieldRunnerMixin):
        return messages

    async def create_and_execute_turn(self, request: AgentTurnCreateRequest) -> AsyncGenerator:
-        with tracing.span("create_and_execute_turn") as span:
+        await self._initialize_tools(request.toolgroups)
+        async with tracing.span("create_and_execute_turn") as span:
            span.set_attribute("session_id", request.session_id)
            span.set_attribute("agent_id", self.agent_id)
            span.set_attribute("request", request.model_dump_json())
-            assert request.stream is True, "Non-streaming not supported"
-
-            session_info = await self.storage.get_session_info(request.session_id)
-            if session_info is None:
-                raise ValueError(f"Session {request.session_id} not found")
-
-            turns = await self.storage.get_session_turns(request.session_id)
-            messages = await self.get_messages_from_turns(turns)
-            messages.extend(request.messages)
-
            turn_id = str(uuid.uuid4())
            span.set_attribute("turn_id", turn_id)
-            start_time = datetime.now().astimezone().isoformat()
-            yield AgentTurnResponseStreamChunk(
-                event=AgentTurnResponseEvent(
-                    payload=AgentTurnResponseTurnStartPayload(
-                        turn_id=turn_id,
-                    )
-                )
-            )
-
-            steps = []
-            output_message = None
-            async for chunk in self.run(
-                session_id=request.session_id,
-                turn_id=turn_id,
-                input_messages=messages,
-                sampling_params=self.agent_config.sampling_params,
-                stream=request.stream,
-                documents=request.documents,
-                toolgroups_for_turn=request.toolgroups,
-            ):
-                if isinstance(chunk, CompletionMessage):
-                    logcat.info(
-                        "agents",
-                        f"returning result from the agent turn: {chunk}",
-                    )
-                    output_message = chunk
-                    continue
-
-                assert isinstance(chunk, AgentTurnResponseStreamChunk), f"Unexpected type {type(chunk)}"
-                event = chunk.event
-                if event.payload.event_type == AgentTurnResponseEventType.step_complete.value:
-                    steps.append(event.payload.step_details)
-
+            async for chunk in self._run_turn(request, turn_id):
                yield chunk

-            assert output_message is not None
-
-            turn = Turn(
-                turn_id=turn_id,
-                session_id=request.session_id,
-                input_messages=request.messages,
-                output_message=output_message,
-                started_at=start_time,
-                completed_at=datetime.now().astimezone().isoformat(),
-                steps=steps,
-            )
-            await self.storage.add_turn_to_session(request.session_id, turn)
-
-            if output_message.tool_calls and request.allow_turn_resume:
-                chunk = AgentTurnResponseStreamChunk(
-                    event=AgentTurnResponseEvent(
-                        payload=AgentTurnResponseTurnAwaitingInputPayload(
-                            turn=turn,
-                        )
-                    )
-                )
-            else:
-                chunk = AgentTurnResponseStreamChunk(
-                    event=AgentTurnResponseEvent(
-                        payload=AgentTurnResponseTurnCompletePayload(
-                            turn=turn,
-                        )
-                    )
-                )
-
-            yield chunk
-
    async def resume_turn(self, request: AgentTurnResumeRequest) -> AsyncGenerator:
-        with tracing.span("resume_turn") as span:
+        await self._initialize_tools()
+        async with tracing.span("resume_turn") as span:
            span.set_attribute("agent_id", self.agent_id)
            span.set_attribute("session_id", request.session_id)
            span.set_attribute("turn_id", request.turn_id)
            span.set_attribute("request", request.model_dump_json())
-            assert request.stream is True, "Non-streaming not supported"
+            async for chunk in self._run_turn(request):
+                yield chunk

-            session_info = await self.storage.get_session_info(request.session_id)
-            if session_info is None:
-                raise ValueError(f"Session {request.session_id} not found")
+    async def _run_turn(
+        self,
+        request: Union[AgentTurnCreateRequest, AgentTurnResumeRequest],
+        turn_id: Optional[str] = None,
+    ) -> AsyncGenerator:
+        assert request.stream is True, "Non-streaming not supported"

-            turns = await self.storage.get_session_turns(request.session_id)
-            if len(turns) == 0:
-                raise ValueError("No turns found for session")
+        is_resume = isinstance(request, AgentTurnResumeRequest)
+        session_info = await self.storage.get_session_info(request.session_id)
+        if session_info is None:
+            raise ValueError(f"Session {request.session_id} not found")

-            messages = await self.get_messages_from_turns(turns)
-            messages.extend(request.tool_responses)
+        turns = await self.storage.get_session_turns(request.session_id)
+        if is_resume and len(turns) == 0:
+            raise ValueError("No turns found for session")

+        steps = []
+        messages = await self.get_messages_from_turns(turns)
+        if is_resume:
+            tool_response_messages = [
+                ToolResponseMessage(call_id=x.call_id, content=x.content) for x in request.tool_responses
+            ]
+            messages.extend(tool_response_messages)
            last_turn = turns[-1]
            last_turn_messages = self.turn_to_messages(last_turn)
            last_turn_messages = [
                x for x in last_turn_messages if isinstance(x, UserMessage) or isinstance(x, ToolResponseMessage)
            ]
+            last_turn_messages.extend(tool_response_messages)

-            # TODO: figure out whether we should add the tool responses to the last turn messages
-            last_turn_messages.extend(request.tool_responses)
-
-            # get the steps from the turn id
-            steps = []
-            steps = turns[-1].steps
+            # get steps from the turn
+            steps = last_turn.steps

            # mark tool execution step as complete
            # if there's no tool execution in progress step (due to storage, or tool call parsing on client),
@ -306,14 +244,7 @@ class ChatAgent(ShieldRunnerMixin):
                step_id=(in_progress_tool_call_step.step_id if in_progress_tool_call_step else str(uuid.uuid4())),
                turn_id=request.turn_id,
                tool_calls=(in_progress_tool_call_step.tool_calls if in_progress_tool_call_step else []),
-                tool_responses=[
-                    ToolResponse(
-                        call_id=x.call_id,
-                        tool_name=x.tool_name,
-                        content=x.content,
-                    )
-                    for x in request.tool_responses
-                ],
+                tool_responses=request.tool_responses,
                completed_at=now,
                started_at=(in_progress_tool_call_step.started_at if in_progress_tool_call_step else now),
            )
@ -327,62 +258,66 @@ class ChatAgent(ShieldRunnerMixin):
                    )
                )
            )
+            input_messages = last_turn_messages

-            output_message = None
-            async for chunk in self.run(
-                session_id=request.session_id,
-                turn_id=request.turn_id,
-                input_messages=messages,
-                sampling_params=self.agent_config.sampling_params,
-                stream=request.stream,
-            ):
-                if isinstance(chunk, CompletionMessage):
-                    output_message = chunk
-                    continue
+            turn_id = request.turn_id
+            start_time = last_turn.started_at
+        else:
+            messages.extend(request.messages)
+            start_time = datetime.now().astimezone().isoformat()
+            input_messages = request.messages

-                assert isinstance(chunk, AgentTurnResponseStreamChunk), f"Unexpected type {type(chunk)}"
-                event = chunk.event
-                if event.payload.event_type == AgentTurnResponseEventType.step_complete.value:
-                    steps.append(event.payload.step_details)
+        output_message = None
+        async for chunk in self.run(
+            session_id=request.session_id,
+            turn_id=turn_id,
+            input_messages=messages,
+            sampling_params=self.agent_config.sampling_params,
+            stream=request.stream,
+            documents=request.documents if not is_resume else None,
+        ):
+            if isinstance(chunk, CompletionMessage):
+                output_message = chunk
+                continue

-                yield chunk
-
-            assert output_message is not None
-
-            last_turn_start_time = datetime.now().astimezone().isoformat()
-            if len(turns) > 0:
-                last_turn_start_time = turns[-1].started_at
-
-            turn = Turn(
-                turn_id=request.turn_id,
-                session_id=request.session_id,
-                input_messages=last_turn_messages,
-                output_message=output_message,
-                started_at=last_turn_start_time,
-                completed_at=datetime.now().astimezone().isoformat(),
-                steps=steps,
-            )
-            await self.storage.add_turn_to_session(request.session_id, turn)
-
-            if output_message.tool_calls:
-                chunk = AgentTurnResponseStreamChunk(
-                    event=AgentTurnResponseEvent(
-                        payload=AgentTurnResponseTurnAwaitingInputPayload(
-                            turn=turn,
-                        )
-                    )
-                )
-            else:
-                chunk = AgentTurnResponseStreamChunk(
-                    event=AgentTurnResponseEvent(
-                        payload=AgentTurnResponseTurnCompletePayload(
-                            turn=turn,
-                        )
-                    )
-                )
+            assert isinstance(chunk, AgentTurnResponseStreamChunk), f"Unexpected type {type(chunk)}"
+            event = chunk.event
+            if event.payload.event_type == AgentTurnResponseEventType.step_complete.value:
+                steps.append(event.payload.step_details)

            yield chunk

+        assert output_message is not None
+
+        turn = Turn(
+            turn_id=turn_id,
+            session_id=request.session_id,
+            input_messages=input_messages,
+            output_message=output_message,
+            started_at=start_time,
+            completed_at=datetime.now().astimezone().isoformat(),
+            steps=steps,
+        )
+        await self.storage.add_turn_to_session(request.session_id, turn)
+        if output_message.tool_calls:
+            chunk = AgentTurnResponseStreamChunk(
+                event=AgentTurnResponseEvent(
+                    payload=AgentTurnResponseTurnAwaitingInputPayload(
+                        turn=turn,
+                    )
+                )
+            )
+        else:
+            chunk = AgentTurnResponseStreamChunk(
+                event=AgentTurnResponseEvent(
+                    payload=AgentTurnResponseTurnCompletePayload(
+                        turn=turn,
+                    )
+                )
+            )
+
+        yield chunk
+
    async def run(
        self,
        session_id: str,
@ -391,7 +326,6 @@ class ChatAgent(ShieldRunnerMixin):
        sampling_params: SamplingParams,
        stream: bool = False,
        documents: Optional[List[Document]] = None,
-        toolgroups_for_turn: Optional[List[AgentToolGroup]] = None,
    ) -> AsyncGenerator:
        # Doing async generators makes downstream code much simpler and everything amenable to
        # streaming. However, it also makes things complicated here because AsyncGenerators cannot
@ -414,7 +348,6 @@ class ChatAgent(ShieldRunnerMixin):
            sampling_params,
            stream,
            documents,
-            toolgroups_for_turn,
        ):
            if isinstance(res, bool):
                return
@ -446,7 +379,7 @@ class ChatAgent(ShieldRunnerMixin):
        shields: List[str],
        touchpoint: str,
    ) -> AsyncGenerator:
-        with tracing.span("run_shields") as span:
+        async with tracing.span("run_shields") as span:
            span.set_attribute("input", [m.model_dump_json() for m in messages])
            if len(shields) == 0:
                span.set_attribute("output", "no shields")
@ -515,27 +448,19 @@ class ChatAgent(ShieldRunnerMixin):
        sampling_params: SamplingParams,
        stream: bool = False,
        documents: Optional[List[Document]] = None,
-        toolgroups_for_turn: Optional[List[AgentToolGroup]] = None,
    ) -> AsyncGenerator:
-        # TODO: simplify all of this code, it can be simpler
-        toolgroup_args = {}
-        toolgroups = set()
-        for toolgroup in self.agent_config.toolgroups + (toolgroups_for_turn or []):
-            if isinstance(toolgroup, AgentToolGroupWithArgs):
-                tool_group_name, tool_name = self._parse_toolgroup_name(toolgroup.name)
-                toolgroups.add(tool_group_name)
-                toolgroup_args[tool_group_name] = toolgroup.args
-            else:
-                toolgroups.add(toolgroup)
-
-        tool_defs, tool_to_group = await self._get_tool_defs(toolgroups_for_turn)
        if documents:
-            await self.handle_documents(session_id, documents, input_messages, tool_defs)
+            await self.handle_documents(session_id, documents, input_messages)

        session_info = await self.storage.get_session_info(session_id)
        # if the session has a memory bank id, let the memory tool use it
        if session_info and session_info.vector_db_id:
-            toolgroup_args[RAG_TOOL_GROUP]["vector_db_ids"].append(session_info.vector_db_id)
+            for tool_name in self.tool_name_to_args.keys():
+                if tool_name == MEMORY_QUERY_TOOL:
+                    if "vector_db_ids" not in self.tool_name_to_args[tool_name]:
+                        self.tool_name_to_args[tool_name]["vector_db_ids"] = [session_info.vector_db_id]
+                    else:
+                        self.tool_name_to_args[tool_name]["vector_db_ids"].append(session_info.vector_db_id)

        output_attachments = []

@ -561,11 +486,11 @@ class ChatAgent(ShieldRunnerMixin):
            content = ""
            stop_reason = None

-            with tracing.span("inference") as span:
+            async with tracing.span("inference") as span:
                async for chunk in await self.inference_api.chat_completion(
                    self.agent_config.model,
                    input_messages,
-                    tools=tool_defs,
+                    tools=self.tool_defs,
                    tool_prompt_format=self.agent_config.tool_config.tool_prompt_format,
                    response_format=self.agent_config.response_format,
                    stream=True,
@ -664,7 +589,7 @@ class ChatAgent(ShieldRunnerMixin):
            )

            if n_iter >= self.agent_config.max_infer_iters:
-                logcat.info("agents", f"done with MAX iterations ({n_iter}), exiting.")
+                logger.info(f"done with MAX iterations ({n_iter}), exiting.")
                # NOTE: mark end_of_turn to indicate to client that we are done with the turn
                # Do not continue the tool call loop after this point
                message.stop_reason = StopReason.end_of_turn
@ -672,7 +597,7 @@ class ChatAgent(ShieldRunnerMixin):
                break

            if stop_reason == StopReason.out_of_tokens:
-                logcat.info("agents", "out of token budget, exiting.")
+                logger.info("out of token budget, exiting.")
                yield message
                break

@ -686,10 +611,10 @@ class ChatAgent(ShieldRunnerMixin):
                            message.content = [message.content] + output_attachments
                    yield message
                else:
-                    logcat.debug("agents", f"completion message with EOM (iter: {n_iter}): {str(message)}")
+                    logger.debug(f"completion message with EOM (iter: {n_iter}): {str(message)}")
                    input_messages = input_messages + [message]
            else:
-                logcat.debug("agents", f"completion message (iter: {n_iter}) from the model: {str(message)}")
+                logger.debug(f"completion message (iter: {n_iter}) from the model: {str(message)}")
                # 1. Start the tool execution step and progress
                step_id = str(uuid.uuid4())
                yield AgentTurnResponseStreamChunk(
@ -738,7 +663,7 @@ class ChatAgent(ShieldRunnerMixin):
                tool_name = tool_call.tool_name
                if isinstance(tool_name, BuiltinTool):
                    tool_name = tool_name.value
-                with tracing.span(
+                async with tracing.span(
                    "tool_execution",
                    {
                        "tool_name": tool_name,
@ -747,12 +672,9 @@ class ChatAgent(ShieldRunnerMixin):
                ) as span:
                    tool_execution_start_time = datetime.now().astimezone().isoformat()
                    tool_call = message.tool_calls[0]
-                    tool_result = await execute_tool_call_maybe(
-                        self.tool_runtime_api,
+                    tool_result = await self.execute_tool_call_maybe(
                        session_id,
                        tool_call,
-                        toolgroup_args,
-                        tool_to_group,
                    )
                    if tool_result.content is None:
                        raise ValueError(
@ -761,7 +683,6 @@ class ChatAgent(ShieldRunnerMixin):
                    result_messages = [
                        ToolResponseMessage(
                            call_id=tool_call.call_id,
-                            tool_name=tool_call.tool_name,
                            content=tool_result.content,
                        )
                    ]
@ -781,7 +702,7 @@ class ChatAgent(ShieldRunnerMixin):
                                tool_responses=[
                                    ToolResponse(
                                        call_id=result_message.call_id,
-                                        tool_name=result_message.tool_name,
+                                        tool_name=tool_call.tool_name,
                                        content=result_message.content,
                                        metadata=tool_result.metadata,
                                    )
@ -805,9 +726,16 @@ class ChatAgent(ShieldRunnerMixin):

                input_messages = input_messages + [message, result_message]

-    async def _get_tool_defs(
-        self, toolgroups_for_turn: Optional[List[AgentToolGroup]] = None
-    ) -> Tuple[List[ToolDefinition], Dict[str, str]]:
+    async def _initialize_tools(
+        self,
+        toolgroups_for_turn: Optional[List[AgentToolGroup]] = None,
+    ) -> None:
+        toolgroup_to_args = {}
+        for toolgroup in (self.agent_config.toolgroups or []) + (toolgroups_for_turn or []):
+            if isinstance(toolgroup, AgentToolGroupWithArgs):
+                tool_group_name, _ = self._parse_toolgroup_name(toolgroup.name)
+                toolgroup_to_args[tool_group_name] = toolgroup.args
+
        # Determine which tools to include
        tool_groups_to_include = toolgroups_for_turn or self.agent_config.toolgroups or []
        agent_config_toolgroups = []
@ -816,8 +744,10 @@ class ChatAgent(ShieldRunnerMixin):
            if name not in agent_config_toolgroups:
                agent_config_toolgroups.append(name)

+        toolgroup_to_args = toolgroup_to_args or {}
+
        tool_name_to_def = {}
-        tool_to_group = {}
+        tool_name_to_args = {}

        for tool_def in self.agent_config.client_tools:
            if tool_name_to_def.get(tool_def.name, None):
@ -835,53 +765,38 @@ class ChatAgent(ShieldRunnerMixin):
                    for param in tool_def.parameters
                },
            )
-            tool_to_group[tool_def.name] = "__client_tools__"
        for toolgroup_name_with_maybe_tool_name in agent_config_toolgroups:
-            toolgroup_name, tool_name = self._parse_toolgroup_name(toolgroup_name_with_maybe_tool_name)
+            toolgroup_name, input_tool_name = self._parse_toolgroup_name(toolgroup_name_with_maybe_tool_name)
            tools = await self.tool_groups_api.list_tools(toolgroup_id=toolgroup_name)
            if not tools.data:
                available_tool_groups = ", ".join(
                    [t.identifier for t in (await self.tool_groups_api.list_tool_groups()).data]
                )
                raise ValueError(f"Toolgroup {toolgroup_name} not found, available toolgroups: {available_tool_groups}")
-            if tool_name is not None and not any(tool.identifier == tool_name for tool in tools.data):
+            if input_tool_name is not None and not any(tool.identifier == input_tool_name for tool in tools.data):
                raise ValueError(
-                    f"Tool {tool_name} not found in toolgroup {toolgroup_name}. Available tools: {', '.join([tool.identifier for tool in tools.data])}"
+                    f"Tool {input_tool_name} not found in toolgroup {toolgroup_name}. Available tools: {', '.join([tool.identifier for tool in tools.data])}"
                )

            for tool_def in tools.data:
                if toolgroup_name.startswith("builtin") and toolgroup_name != RAG_TOOL_GROUP:
-                    tool_name = tool_def.identifier
-                    built_in_type = BuiltinTool.brave_search
-                    if tool_name == "web_search":
-                        built_in_type = BuiltinTool.brave_search
+                    identifier: str | BuiltinTool | None = tool_def.identifier
+                    if identifier == "web_search":
+                        identifier = BuiltinTool.brave_search
                    else:
-                        built_in_type = BuiltinTool(tool_name)
+                        identifier = BuiltinTool(identifier)
+                else:
+                    # add if tool_name is unspecified or the tool_def identifier is the same as the tool_name
+                    if input_tool_name in (None, tool_def.identifier):
+                        identifier = tool_def.identifier
+                    else:
+                        identifier = None

-                    if tool_name_to_def.get(built_in_type, None):
-                        raise ValueError(f"Tool {built_in_type} already exists")
-
-                    tool_name_to_def[built_in_type] = ToolDefinition(
-                        tool_name=built_in_type,
-                        description=tool_def.description,
-                        parameters={
-                            param.name: ToolParamDefinition(
-                                param_type=param.parameter_type,
-                                description=param.description,
-                                required=param.required,
-                                default=param.default,
-                            )
-                            for param in tool_def.parameters
-                        },
-                    )
-                    tool_to_group[built_in_type] = tool_def.toolgroup_id
-                    continue
-
-                if tool_name_to_def.get(tool_def.identifier, None):
-                    raise ValueError(f"Tool {tool_def.identifier} already exists")
-                if tool_name in (None, tool_def.identifier):
+                if tool_name_to_def.get(identifier, None):
+                    raise ValueError(f"Tool {identifier} already exists")
+                if identifier:
                    tool_name_to_def[tool_def.identifier] = ToolDefinition(
-                        tool_name=tool_def.identifier,
+                        tool_name=identifier,
                        description=tool_def.description,
                        parameters={
                            param.name: ToolParamDefinition(
@ -893,9 +808,9 @@ class ChatAgent(ShieldRunnerMixin):
                            for param in tool_def.parameters
                        },
                    )
-                    tool_to_group[tool_def.identifier] = tool_def.toolgroup_id
+                    tool_name_to_args[tool_def.identifier] = toolgroup_to_args.get(toolgroup_name, {})

-        return list(tool_name_to_def.values()), tool_to_group
+        self.tool_defs, self.tool_name_to_args = list(tool_name_to_def.values()), tool_name_to_args

    def _parse_toolgroup_name(self, toolgroup_name_with_maybe_tool_name: str) -> tuple[str, Optional[str]]:
        """Parse a toolgroup name into its components.
@ -914,15 +829,46 @@ class ChatAgent(ShieldRunnerMixin):
            tool_group, tool_name = split_names[0], None
        return tool_group, tool_name

+    async def execute_tool_call_maybe(
+        self,
+        session_id: str,
+        tool_call: ToolCall,
+    ) -> ToolInvocationResult:
+        tool_name = tool_call.tool_name
+        registered_tool_names = [tool_def.tool_name for tool_def in self.tool_defs]
+        if tool_name not in registered_tool_names:
+            raise ValueError(
+                f"Tool {tool_name} not found in provided tools, registered tools: {', '.join([str(x) for x in registered_tool_names])}"
+            )
+        if isinstance(tool_name, BuiltinTool):
+            if tool_name == BuiltinTool.brave_search:
+                tool_name_str = WEB_SEARCH_TOOL
+            else:
+                tool_name_str = tool_name.value
+        else:
+            tool_name_str = tool_name
+
+        logger.info(f"executing tool call: {tool_name_str} with args: {tool_call.arguments}")
+        result = await self.tool_runtime_api.invoke_tool(
+            tool_name=tool_name_str,
+            kwargs={
+                "session_id": session_id,
+                # get the arguments generated by the model and augment with toolgroup arg overrides for the agent
+                **tool_call.arguments,
+                **self.tool_name_to_args.get(tool_name_str, {}),
+            },
+        )
+        logger.debug(f"tool call {tool_name_str} completed with result: {result}")
+        return result
+
    async def handle_documents(
        self,
        session_id: str,
        documents: List[Document],
        input_messages: List[Message],
-        tool_defs: Dict[str, ToolDefinition],
    ) -> None:
-        memory_tool = any(tool_def.tool_name == MEMORY_QUERY_TOOL for tool_def in tool_defs)
-        code_interpreter_tool = any(tool_def.tool_name == BuiltinTool.code_interpreter for tool_def in tool_defs)
+        memory_tool = any(tool_def.tool_name == MEMORY_QUERY_TOOL for tool_def in self.tool_defs)
+        code_interpreter_tool = any(tool_def.tool_name == BuiltinTool.code_interpreter for tool_def in self.tool_defs)
        content_items = []
        url_items = []
        pattern = re.compile("^(https?://|file://|data:)")
@ -1032,7 +978,7 @@ async def attachment_message(tempdir: str, urls: List[URL]) -> ToolResponseMessa
            path = urlparse(uri).path
            basename = os.path.basename(path)
            filepath = f"{tempdir}/{make_random_string() + basename}"
-            logcat.info("agents", f"Downloading {url} -> {filepath}")
+            logger.info(f"Downloading {url} -> {filepath}")

            async with httpx.AsyncClient() as client:
                r = await client.get(uri)
@ -1050,42 +996,10 @@ async def attachment_message(tempdir: str, urls: List[URL]) -> ToolResponseMessa

    return ToolResponseMessage(
        call_id="",
-        tool_name=BuiltinTool.code_interpreter,
        content=content,
    )


-async def execute_tool_call_maybe(
-    tool_runtime_api: ToolRuntime,
-    session_id: str,
-    tool_call: ToolCall,
-    toolgroup_args: Dict[str, Dict[str, Any]],
-    tool_to_group: Dict[str, str],
-) -> ToolInvocationResult:
-    name = tool_call.tool_name
-    group_name = tool_to_group.get(name, None)
-    if group_name is None:
-        raise ValueError(f"Tool {name} not found in any tool group")
-    if isinstance(name, BuiltinTool):
-        if name == BuiltinTool.brave_search:
-            name = WEB_SEARCH_TOOL
-        else:
-            name = name.value
-
-    logcat.info("agents", f"executing tool call: {name} with args: {tool_call.arguments}")
-    result = await tool_runtime_api.invoke_tool(
-        tool_name=name,
-        kwargs={
-            "session_id": session_id,
-            # get the arguments generated by the model and augment with toolgroup arg overrides for the agent
-            **tool_call.arguments,
-            **toolgroup_args.get(group_name, {}),
-        },
-    )
-    logcat.debug("agents", f"tool call {name} completed with result: {result}")
-    return result
-
-
 def _interpret_content_as_attachment(
    content: str,
 ) -> Optional[Attachment]:
--- a/llama_stack/providers/inline/agents/meta_reference/agents.py
+++ b/llama_stack/providers/inline/agents/meta_reference/agents.py
@ -12,6 +12,7 @@ import uuid
 from typing import AsyncGenerator, List, Optional, Union

 from llama_stack.apis.agents import (
+    Agent,
    AgentConfig,
    AgentCreateResponse,
    Agents,
@ -21,12 +22,15 @@ from llama_stack.apis.agents import (
    AgentTurnCreateRequest,
    AgentTurnResumeRequest,
    Document,
+    ListAgentSessionsResponse,
+    ListAgentsResponse,
    Session,
    Turn,
 )
 from llama_stack.apis.inference import (
    Inference,
    ToolConfig,
+    ToolResponse,
    ToolResponseMessage,
    UserMessage,
 )
@ -83,7 +87,7 @@ class MetaReferenceAgentsImpl(Agents):
            agent_id=agent_id,
        )

-    async def get_agent(self, agent_id: str) -> ChatAgent:
+    async def _get_agent_impl(self, agent_id: str) -> ChatAgent:
        agent_config = await self.persistence_store.get(
            key=f"agent:{agent_id}",
        )
@ -119,7 +123,7 @@ class MetaReferenceAgentsImpl(Agents):
        agent_id: str,
        session_name: str,
    ) -> AgentSessionCreateResponse:
-        agent = await self.get_agent(agent_id)
+        agent = await self._get_agent_impl(agent_id)

        session_id = await agent.create_session(session_name)
        return AgentSessionCreateResponse(
@ -140,7 +144,6 @@ class MetaReferenceAgentsImpl(Agents):
        documents: Optional[List[Document]] = None,
        stream: Optional[bool] = False,
        tool_config: Optional[ToolConfig] = None,
-        allow_turn_resume: Optional[bool] = False,
    ) -> AsyncGenerator:
        request = AgentTurnCreateRequest(
            agent_id=agent_id,
@ -150,7 +153,6 @@ class MetaReferenceAgentsImpl(Agents):
            toolgroups=toolgroups,
            documents=documents,
            tool_config=tool_config,
-            allow_turn_resume=allow_turn_resume,
        )
        if stream:
            return self._create_agent_turn_streaming(request)
@ -161,7 +163,7 @@ class MetaReferenceAgentsImpl(Agents):
        self,
        request: AgentTurnCreateRequest,
    ) -> AsyncGenerator:
-        agent = await self.get_agent(request.agent_id)
+        agent = await self._get_agent_impl(request.agent_id)
        async for event in agent.create_and_execute_turn(request):
            yield event

@ -170,7 +172,7 @@ class MetaReferenceAgentsImpl(Agents):
        agent_id: str,
        session_id: str,
        turn_id: str,
-        tool_responses: List[ToolResponseMessage],
+        tool_responses: List[ToolResponse],
        stream: Optional[bool] = False,
    ) -> AsyncGenerator:
        request = AgentTurnResumeRequest(
@ -189,12 +191,12 @@ class MetaReferenceAgentsImpl(Agents):
        self,
        request: AgentTurnResumeRequest,
    ) -> AsyncGenerator:
-        agent = await self.get_agent(request.agent_id)
+        agent = await self._get_agent_impl(request.agent_id)
        async for event in agent.resume_turn(request):
            yield event

    async def get_agents_turn(self, agent_id: str, session_id: str, turn_id: str) -> Turn:
-        agent = await self.get_agent(agent_id)
+        agent = await self._get_agent_impl(agent_id)
        turn = await agent.storage.get_session_turn(session_id, turn_id)
        return turn

@ -211,7 +213,7 @@ class MetaReferenceAgentsImpl(Agents):
        session_id: str,
        turn_ids: Optional[List[str]] = None,
    ) -> Session:
-        agent = await self.get_agent(agent_id)
+        agent = await self._get_agent_impl(agent_id)
        session_info = await agent.storage.get_session_info(session_id)
        if session_info is None:
            raise ValueError(f"Session {session_id} not found")
@ -233,3 +235,15 @@ class MetaReferenceAgentsImpl(Agents):

    async def shutdown(self) -> None:
        pass
+
+    async def list_agents(self) -> ListAgentsResponse:
+        pass
+
+    async def get_agent(self, agent_id: str) -> Agent:
+        pass
+
+    async def list_agent_sessions(
+        self,
+        agent_id: str,
+    ) -> ListAgentSessionsResponse:
+        pass
--- a/llama_stack/providers/inline/agents/meta_reference/safety.py
+++ b/llama_stack/providers/inline/agents/meta_reference/safety.py
@ -10,6 +10,7 @@ from typing import List

 from llama_stack.apis.inference import Message
 from llama_stack.apis.safety import Safety, SafetyViolation, ViolationLevel
+from llama_stack.providers.utils.telemetry import tracing

 log = logging.getLogger(__name__)

@ -32,15 +33,14 @@ class ShieldRunnerMixin:
        self.output_shields = output_shields

    async def run_multiple_shields(self, messages: List[Message], identifiers: List[str]) -> None:
-        responses = await asyncio.gather(
-            *[
-                self.safety_api.run_shield(
+        async def run_shield_with_span(identifier: str):
+            async with tracing.span(f"run_shield_{identifier}"):
+                return await self.safety_api.run_shield(
                    shield_id=identifier,
                    messages=messages,
                )
-                for identifier in identifiers
-            ]
-        )
+
+        responses = await asyncio.gather(*[run_shield_with_span(identifier) for identifier in identifiers])
        for identifier, response in zip(identifiers, responses, strict=False):
            if not response.violation:
                continue
--- a/llama_stack/providers/inline/agents/meta_reference/tests/test_chat_agent.py
+++ b/llama_stack/providers/inline/agents/meta_reference/tests/test_chat_agent.py
@ -1,400 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-import tempfile
-from typing import AsyncIterator, List, Optional, Union
-
-import pytest
-
-from llama_stack.apis.agents import (
-    AgentConfig,
-    AgentToolGroupWithArgs,
-    AgentTurnCreateRequest,
-    AgentTurnResponseTurnCompletePayload,
-    StepType,
-)
-from llama_stack.apis.common.content_types import URL
-from llama_stack.apis.inference import (
-    ChatCompletionResponse,
-    ChatCompletionResponseEvent,
-    ChatCompletionResponseStreamChunk,
-    CompletionMessage,
-    LogProbConfig,
-    Message,
-    ResponseFormat,
-    SamplingParams,
-    ToolChoice,
-    ToolDefinition,
-    ToolPromptFormat,
-    UserMessage,
-)
-from llama_stack.apis.safety import RunShieldResponse
-from llama_stack.apis.tools import (
-    Tool,
-    ToolDef,
-    ToolGroup,
-    ToolHost,
-    ToolInvocationResult,
-)
-from llama_stack.apis.vector_io import QueryChunksResponse
-from llama_stack.models.llama.datatypes import BuiltinTool
-from llama_stack.providers.inline.agents.meta_reference.agent_instance import (
-    MEMORY_QUERY_TOOL,
-)
-from llama_stack.providers.inline.agents.meta_reference.agents import (
-    MetaReferenceAgentsImpl,
-    MetaReferenceAgentsImplConfig,
-)
-from llama_stack.providers.utils.kvstore.config import SqliteKVStoreConfig
-
-
-class MockInferenceAPI:
-    async def chat_completion(
-        self,
-        model: str,
-        messages: List[Message],
-        sampling_params: Optional[SamplingParams] = SamplingParams(),
-        response_format: Optional[ResponseFormat] = None,
-        tools: Optional[List[ToolDefinition]] = None,
-        tool_choice: Optional[ToolChoice] = None,
-        tool_prompt_format: Optional[ToolPromptFormat] = None,
-        stream: Optional[bool] = False,
-        logprobs: Optional[LogProbConfig] = None,
-    ) -> Union[ChatCompletionResponse, AsyncIterator[ChatCompletionResponseStreamChunk]]:
-        async def stream_response():
-            yield ChatCompletionResponseStreamChunk(
-                event=ChatCompletionResponseEvent(
-                    event_type="start",
-                    delta="",
-                )
-            )
-
-            yield ChatCompletionResponseStreamChunk(
-                event=ChatCompletionResponseEvent(
-                    event_type="progress",
-                    delta="AI is a fascinating field...",
-                )
-            )
-
-            yield ChatCompletionResponseStreamChunk(
-                event=ChatCompletionResponseEvent(
-                    event_type="complete",
-                    delta="",
-                    stop_reason="end_of_turn",
-                )
-            )
-
-        if stream:
-            return stream_response()
-        else:
-            return ChatCompletionResponse(
-                completion_message=CompletionMessage(
-                    role="assistant",
-                    content="Mock response",
-                    stop_reason="end_of_turn",
-                ),
-                logprobs={"token_logprobs": [0.1, 0.2, 0.3]} if logprobs else None,
-            )
-
-
-class MockSafetyAPI:
-    async def run_shield(self, shield_id: str, messages: List[Message]) -> RunShieldResponse:
-        return RunShieldResponse(violation=None)
-
-
-class MockVectorIOAPI:
-    def __init__(self):
-        self.chunks = {}
-
-    async def insert_chunks(self, vector_db_id, chunks, ttl_seconds=None):
-        for chunk in chunks:
-            metadata = chunk.metadata
-            self.chunks[vector_db_id][metadata["document_id"]] = chunk
-
-    async def query_chunks(self, vector_db_id, query, params=None):
-        if vector_db_id not in self.chunks:
-            raise ValueError(f"Bank {vector_db_id} not found")
-
-        chunks = list(self.chunks[vector_db_id].values())
-        scores = [1.0] * len(chunks)
-        return QueryChunksResponse(chunks=chunks, scores=scores)
-
-
-class MockToolGroupsAPI:
-    async def register_tool_group(self, toolgroup_id: str, provider_id: str, mcp_endpoint=None, args=None) -> None:
-        pass
-
-    async def get_tool_group(self, toolgroup_id: str) -> ToolGroup:
-        return ToolGroup(
-            identifier=toolgroup_id,
-            provider_resource_id=toolgroup_id,
-        )
-
-    async def list_tool_groups(self) -> List[ToolGroup]:
-        return []
-
-    async def list_tools(self, tool_group_id: Optional[str] = None) -> List[Tool]:
-        if tool_group_id == MEMORY_TOOLGROUP:
-            return [
-                Tool(
-                    identifier=MEMORY_QUERY_TOOL,
-                    provider_resource_id=MEMORY_QUERY_TOOL,
-                    toolgroup_id=MEMORY_TOOLGROUP,
-                    tool_host=ToolHost.client,
-                    description="Mock tool",
-                    provider_id="builtin::rag",
-                    parameters=[],
-                )
-            ]
-        if tool_group_id == CODE_INTERPRETER_TOOLGROUP:
-            return [
-                Tool(
-                    identifier="code_interpreter",
-                    provider_resource_id="code_interpreter",
-                    toolgroup_id=CODE_INTERPRETER_TOOLGROUP,
-                    tool_host=ToolHost.client,
-                    description="Mock tool",
-                    provider_id="builtin::code_interpreter",
-                    parameters=[],
-                )
-            ]
-        return []
-
-    async def get_tool(self, tool_name: str) -> Tool:
-        return Tool(
-            identifier=tool_name,
-            provider_resource_id=tool_name,
-            toolgroup_id="mock_group",
-            tool_host=ToolHost.client,
-            description="Mock tool",
-            provider_id="mock_provider",
-            parameters=[],
-        )
-
-    async def unregister_tool_group(self, tool_group_id: str) -> None:
-        pass
-
-
-class MockToolRuntimeAPI:
-    async def list_runtime_tools(
-        self, tool_group_id: Optional[str] = None, mcp_endpoint: Optional[URL] = None
-    ) -> List[ToolDef]:
-        return []
-
-    async def invoke_tool(self, tool_name: str, args: dict) -> ToolInvocationResult:
-        return ToolInvocationResult(content={"result": "Mock tool result"})
-
-
-@pytest.fixture
-def mock_inference_api():
-    return MockInferenceAPI()
-
-
-@pytest.fixture
-def mock_safety_api():
-    return MockSafetyAPI()
-
-
-@pytest.fixture
-def mock_vector_io_api():
-    return MockVectorIOAPI()
-
-
-@pytest.fixture
-def mock_tool_groups_api():
-    return MockToolGroupsAPI()
-
-
-@pytest.fixture
-def mock_tool_runtime_api():
-    return MockToolRuntimeAPI()
-
-
-@pytest.fixture
-async def get_agents_impl(
-    mock_inference_api,
-    mock_safety_api,
-    mock_vector_io_api,
-    mock_tool_runtime_api,
-    mock_tool_groups_api,
-):
-    sqlite_file = tempfile.NamedTemporaryFile(delete=False, suffix=".db")
-    impl = MetaReferenceAgentsImpl(
-        config=MetaReferenceAgentsImplConfig(
-            persistence_store=SqliteKVStoreConfig(
-                db_name=sqlite_file.name,
-            ),
-        ),
-        inference_api=mock_inference_api,
-        safety_api=mock_safety_api,
-        vector_io_api=mock_vector_io_api,
-        tool_runtime_api=mock_tool_runtime_api,
-        tool_groups_api=mock_tool_groups_api,
-    )
-    await impl.initialize()
-    return impl
-
-
-@pytest.fixture
-async def get_chat_agent(get_agents_impl):
-    impl = await get_agents_impl
-    agent_config = AgentConfig(
-        model="test_model",
-        instructions="You are a helpful assistant.",
-        toolgroups=[],
-        tool_choice=ToolChoice.auto,
-        enable_session_persistence=False,
-        input_shields=["test_shield"],
-    )
-    response = await impl.create_agent(agent_config)
-    return await impl.get_agent(response.agent_id)
-
-
-MEMORY_TOOLGROUP = "builtin::rag"
-CODE_INTERPRETER_TOOLGROUP = "builtin::code_interpreter"
-
-
-@pytest.fixture
-async def get_chat_agent_with_tools(get_agents_impl, request):
-    impl = await get_agents_impl
-    toolgroups = request.param
-    agent_config = AgentConfig(
-        model="test_model",
-        instructions="You are a helpful assistant.",
-        toolgroups=toolgroups,
-        tool_choice=ToolChoice.auto,
-        enable_session_persistence=False,
-        input_shields=["test_shield"],
-    )
-    response = await impl.create_agent(agent_config)
-    return await impl.get_agent(response.agent_id)
-
-
-@pytest.mark.asyncio
-async def test_chat_agent_create_and_execute_turn(get_chat_agent):
-    chat_agent = await get_chat_agent
-    session_id = await chat_agent.create_session("Test Session")
-    request = AgentTurnCreateRequest(
-        agent_id=chat_agent.agent_id,
-        session_id=session_id,
-        messages=[UserMessage(content="Hello")],
-        stream=True,
-    )
-
-    responses = []
-    async for response in chat_agent.create_and_execute_turn(request):
-        responses.append(response)
-
-    assert len(responses) > 0
-    assert (
-        len(responses) == 7
-    )  # TurnStart, ShieldCallStart, ShieldCallComplete, StepStart, StepProgress, StepComplete, TurnComplete
-    assert responses[0].event.payload.turn_id is not None
-
-
-@pytest.mark.asyncio
-async def test_run_multiple_shields_wrapper(get_chat_agent):
-    chat_agent = await get_chat_agent
-    messages = [UserMessage(content="Test message")]
-    shields = ["test_shield"]
-
-    responses = [
-        chunk
-        async for chunk in chat_agent.run_multiple_shields_wrapper(
-            turn_id="test_turn_id",
-            messages=messages,
-            shields=shields,
-            touchpoint="user-input",
-        )
-    ]
-
-    assert len(responses) == 2  # StepStart, StepComplete
-    assert responses[0].event.payload.step_type.value == "shield_call"
-    assert not responses[1].event.payload.step_details.violation
-
-
-@pytest.mark.asyncio
-async def test_chat_agent_complex_turn(get_chat_agent):
-    chat_agent = await get_chat_agent
-    session_id = await chat_agent.create_session("Test Session")
-    request = AgentTurnCreateRequest(
-        agent_id=chat_agent.agent_id,
-        session_id=session_id,
-        messages=[UserMessage(content="Tell me about AI and then use a tool.")],
-        stream=True,
-    )
-
-    responses = []
-    async for response in chat_agent.create_and_execute_turn(request):
-        responses.append(response)
-
-    assert len(responses) > 0
-
-    step_types = [
-        response.event.payload.step_type for response in responses if hasattr(response.event.payload, "step_type")
-    ]
-
-    assert StepType.shield_call in step_types, "Shield call step is missing"
-    assert StepType.inference in step_types, "Inference step is missing"
-
-    event_types = [
-        response.event.payload.event_type for response in responses if hasattr(response.event.payload, "event_type")
-    ]
-    assert "turn_start" in event_types, "Start event is missing"
-    assert "turn_complete" in event_types, "Complete event is missing"
-
-    assert any(isinstance(response.event.payload, AgentTurnResponseTurnCompletePayload) for response in responses), (
-        "Turn complete event is missing"
-    )
-    turn_complete_payload = next(
-        response.event.payload
-        for response in responses
-        if isinstance(response.event.payload, AgentTurnResponseTurnCompletePayload)
-    )
-    turn = turn_complete_payload.turn
-    assert turn.input_messages == request.messages, "Input messages do not match"
-
-
-@pytest.mark.asyncio
-@pytest.mark.parametrize(
-    "toolgroups, expected_memory, expected_code_interpreter",
-    [
-        ([], False, False),  # no tools
-        ([MEMORY_TOOLGROUP], True, False),  # memory only
-        ([CODE_INTERPRETER_TOOLGROUP], False, True),  # code interpreter only
-        ([MEMORY_TOOLGROUP, CODE_INTERPRETER_TOOLGROUP], True, True),  # all tools
-    ],
-)
-async def test_chat_agent_tools(get_agents_impl, toolgroups, expected_memory, expected_code_interpreter):
-    impl = await get_agents_impl
-    agent_config = AgentConfig(
-        model="test_model",
-        instructions="You are a helpful assistant.",
-        toolgroups=toolgroups,
-        tool_choice=ToolChoice.auto,
-        enable_session_persistence=False,
-        input_shields=["test_shield"],
-    )
-    response = await impl.create_agent(agent_config)
-    chat_agent = await impl.get_agent(response.agent_id)
-
-    tool_defs, _ = await chat_agent._get_tool_defs()
-    if expected_memory:
-        assert MEMORY_QUERY_TOOL in tool_defs
-    if expected_code_interpreter:
-        assert BuiltinTool.code_interpreter in tool_defs
-    if expected_memory and expected_code_interpreter:
-        # override the tools for turn
-        new_tool_defs, _ = await chat_agent._get_tool_defs(
-            toolgroups_for_turn=[
-                AgentToolGroupWithArgs(
-                    name=MEMORY_TOOLGROUP,
-                    args={"vector_dbs": ["test_vector_db"]},
-                )
-            ]
-        )
-        assert MEMORY_QUERY_TOOL in new_tool_defs
-        assert BuiltinTool.code_interpreter not in new_tool_defs
--- a/llama_stack/providers/inline/datasetio/localfs/init.py
+++ b/llama_stack/providers/inline/datasetio/localfs/init.py
@ -4,12 +4,14 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.

+from typing import Any, Dict
+
 from .config import LocalFSDatasetIOConfig


 async def get_provider_impl(
    config: LocalFSDatasetIOConfig,
-    _deps,
+    _deps: Dict[str, Any],
 ):
    from .datasetio import LocalFSDatasetIOImpl

--- a/llama_stack/providers/inline/datasetio/localfs/config.py
+++ b/llama_stack/providers/inline/datasetio/localfs/config.py
@ -3,9 +3,10 @@
 #
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
+from typing import Any, Dict
+
 from pydantic import BaseModel

-from llama_stack.distribution.utils.config_dirs import RUNTIME_BASE_DIR
 from llama_stack.providers.utils.kvstore.config import (
    KVStoreConfig,
    SqliteKVStoreConfig,
@ -13,6 +14,13 @@ from llama_stack.providers.utils.kvstore.config import (


 class LocalFSDatasetIOConfig(BaseModel):
-    kvstore: KVStoreConfig = SqliteKVStoreConfig(
-        db_path=(RUNTIME_BASE_DIR / "localfs_datasetio.db").as_posix()
-    )  # Uses SQLite config specific to localfs storage
+    kvstore: KVStoreConfig
+
+    @classmethod
+    def sample_run_config(cls, __distro_dir__: str, **kwargs: Any) -> Dict[str, Any]:
+        return {
+            "kvstore": SqliteKVStoreConfig.sample_run_config(
+                __distro_dir__=__distro_dir__,
+                db_name="localfs_datasetio.db",
+            )
+        }
--- a/llama_stack/providers/inline/datasetio/localfs/datasetio.py
+++ b/llama_stack/providers/inline/datasetio/localfs/datasetio.py
@ -172,7 +172,7 @@ class LocalFSDatasetIOImpl(DatasetIO, DatasetsProtocolPrivate):
        new_rows_df = dataset_impl._validate_dataset_schema(new_rows_df)
        dataset_impl.df = pandas.concat([dataset_impl.df, new_rows_df], ignore_index=True)

-        url = str(dataset_info.dataset_def.url)
+        url = str(dataset_info.dataset_def.url.uri)
        parsed_url = urlparse(url)

        if parsed_url.scheme == "file" or not parsed_url.scheme:
--- a/llama_stack/providers/inline/eval/meta_reference/init.py
+++ b/llama_stack/providers/inline/eval/meta_reference/init.py
@ -3,16 +3,16 @@
 #
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
-from typing import Dict
+from typing import Any, Dict

-from llama_stack.distribution.datatypes import Api, ProviderSpec
+from llama_stack.distribution.datatypes import Api

 from .config import MetaReferenceEvalConfig


 async def get_provider_impl(
    config: MetaReferenceEvalConfig,
-    deps: Dict[Api, ProviderSpec],
+    deps: Dict[Api, Any],
 ):
    from .eval import MetaReferenceEvalImpl

--- a/llama_stack/providers/inline/eval/meta_reference/config.py
+++ b/llama_stack/providers/inline/eval/meta_reference/config.py
@ -3,9 +3,10 @@
 #
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
+from typing import Any, Dict
+
 from pydantic import BaseModel

-from llama_stack.distribution.utils.config_dirs import RUNTIME_BASE_DIR
 from llama_stack.providers.utils.kvstore.config import (
    KVStoreConfig,
    SqliteKVStoreConfig,
@ -13,6 +14,13 @@ from llama_stack.providers.utils.kvstore.config import (


 class MetaReferenceEvalConfig(BaseModel):
-    kvstore: KVStoreConfig = SqliteKVStoreConfig(
-        db_path=(RUNTIME_BASE_DIR / "meta_reference_eval.db").as_posix()
-    )  # Uses SQLite config specific to Meta Reference Eval storage
+    kvstore: KVStoreConfig
+
+    @classmethod
+    def sample_run_config(cls, __distro_dir__: str, **kwargs: Any) -> Dict[str, Any]:
+        return {
+            "kvstore": SqliteKVStoreConfig.sample_run_config(
+                __distro_dir__=__distro_dir__,
+                db_name="meta_reference_eval.db",
+            )
+        }
--- a/llama_stack/providers/inline/eval/meta_reference/eval.py
+++ b/llama_stack/providers/inline/eval/meta_reference/eval.py
@ -83,7 +83,7 @@ class MetaReferenceEvalImpl(
    async def run_eval(
        self,
        benchmark_id: str,
-        task_config: BenchmarkConfig,
+        benchmark_config: BenchmarkConfig,
    ) -> Job:
        task_def = self.benchmarks[benchmark_id]
        dataset_id = task_def.dataset_id
@ -92,13 +92,13 @@ class MetaReferenceEvalImpl(
        validate_dataset_schema(dataset_def.dataset_schema, get_valid_schemas(Api.eval.value))
        all_rows = await self.datasetio_api.get_rows_paginated(
            dataset_id=dataset_id,
-            rows_in_page=(-1 if task_config.num_examples is None else task_config.num_examples),
+            rows_in_page=(-1 if benchmark_config.num_examples is None else benchmark_config.num_examples),
        )
        res = await self.evaluate_rows(
            benchmark_id=benchmark_id,
            input_rows=all_rows.rows,
            scoring_functions=scoring_functions,
-            task_config=task_config,
+            benchmark_config=benchmark_config,
        )

        # TODO: currently needs to wait for generation before returning
@ -108,9 +108,9 @@ class MetaReferenceEvalImpl(
        return Job(job_id=job_id)

    async def _run_agent_generation(
-        self, input_rows: List[Dict[str, Any]], task_config: BenchmarkConfig
+        self, input_rows: List[Dict[str, Any]], benchmark_config: BenchmarkConfig
    ) -> List[Dict[str, Any]]:
-        candidate = task_config.eval_candidate
+        candidate = benchmark_config.eval_candidate
        create_response = await self.agents_api.create_agent(candidate.config)
        agent_id = create_response.agent_id

@ -151,9 +151,9 @@ class MetaReferenceEvalImpl(
        return generations

    async def _run_model_generation(
-        self, input_rows: List[Dict[str, Any]], task_config: BenchmarkConfig
+        self, input_rows: List[Dict[str, Any]], benchmark_config: BenchmarkConfig
    ) -> List[Dict[str, Any]]:
-        candidate = task_config.eval_candidate
+        candidate = benchmark_config.eval_candidate
        assert candidate.sampling_params.max_tokens is not None, "SamplingParams.max_tokens must be provided"

        generations = []
@ -189,13 +189,13 @@ class MetaReferenceEvalImpl(
        benchmark_id: str,
        input_rows: List[Dict[str, Any]],
        scoring_functions: List[str],
-        task_config: BenchmarkConfig,
+        benchmark_config: BenchmarkConfig,
    ) -> EvaluateResponse:
-        candidate = task_config.eval_candidate
+        candidate = benchmark_config.eval_candidate
        if candidate.type == "agent":
-            generations = await self._run_agent_generation(input_rows, task_config)
+            generations = await self._run_agent_generation(input_rows, benchmark_config)
        elif candidate.type == "model":
-            generations = await self._run_model_generation(input_rows, task_config)
+            generations = await self._run_model_generation(input_rows, benchmark_config)
        else:
            raise ValueError(f"Invalid candidate type: {candidate.type}")

@ -204,9 +204,9 @@ class MetaReferenceEvalImpl(
            input_r | generated_r for input_r, generated_r in zip(input_rows, generations, strict=False)
        ]

-        if task_config.scoring_params is not None:
+        if benchmark_config.scoring_params is not None:
            scoring_functions_dict = {
-                scoring_fn_id: task_config.scoring_params.get(scoring_fn_id, None)
+                scoring_fn_id: benchmark_config.scoring_params.get(scoring_fn_id, None)
                for scoring_fn_id in scoring_functions
            }
        else:
--- a/llama_stack/providers/inline/inference/meta_reference/init.py
+++ b/llama_stack/providers/inline/inference/meta_reference/init.py
@ -4,14 +4,14 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.

-from typing import Union
+from typing import Any, Dict, Union

 from .config import MetaReferenceInferenceConfig, MetaReferenceQuantizedInferenceConfig


 async def get_provider_impl(
    config: Union[MetaReferenceInferenceConfig, MetaReferenceQuantizedInferenceConfig],
-    _deps,
+    _deps: Dict[str, Any],
 ):
    from .inference import MetaReferenceInferenceImpl

--- a/llama_stack/providers/inline/inference/meta_reference/inference.py
+++ b/llama_stack/providers/inline/inference/meta_reference/inference.py
@ -136,11 +136,13 @@ class MetaReferenceInferenceImpl(
        self,
        model_id: str,
        content: InterleavedContent,
-        sampling_params: Optional[SamplingParams] = SamplingParams(),
+        sampling_params: Optional[SamplingParams] = None,
        response_format: Optional[ResponseFormat] = None,
        stream: Optional[bool] = False,
        logprobs: Optional[LogProbConfig] = None,
    ) -> Union[CompletionResponse, CompletionResponseStreamChunk]:
+        if sampling_params is None:
+            sampling_params = SamplingParams()
        if logprobs:
            assert logprobs.top_k == 1, f"Unexpected top_k={logprobs.top_k}"

@ -244,7 +246,7 @@ class MetaReferenceInferenceImpl(
        self,
        model_id: str,
        messages: List[Message],
-        sampling_params: Optional[SamplingParams] = SamplingParams(),
+        sampling_params: Optional[SamplingParams] = None,
        response_format: Optional[ResponseFormat] = None,
        tools: Optional[List[ToolDefinition]] = None,
        tool_choice: Optional[ToolChoice] = ToolChoice.auto,
@ -253,6 +255,8 @@ class MetaReferenceInferenceImpl(
        logprobs: Optional[LogProbConfig] = None,
        tool_config: Optional[ToolConfig] = None,
    ) -> AsyncGenerator:
+        if sampling_params is None:
+            sampling_params = SamplingParams()
        if logprobs:
            assert logprobs.top_k == 1, f"Unexpected top_k={logprobs.top_k}"

--- a/llama_stack/providers/inline/inference/sentence_transformers/init.py
+++ b/llama_stack/providers/inline/inference/sentence_transformers/init.py
@ -4,6 +4,8 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.

+from typing import Any, Dict
+
 from llama_stack.providers.inline.inference.sentence_transformers.config import (
    SentenceTransformersInferenceConfig,
 )
@ -11,7 +13,7 @@ from llama_stack.providers.inline.inference.sentence_transformers.config import

 async def get_provider_impl(
    config: SentenceTransformersInferenceConfig,
-    _deps,
+    _deps: Dict[str, Any],
 ):
    from .sentence_transformers import SentenceTransformersInferenceImpl

--- a/llama_stack/providers/inline/inference/sentence_transformers/sentence_transformers.py
+++ b/llama_stack/providers/inline/inference/sentence_transformers/sentence_transformers.py
@ -53,7 +53,7 @@ class SentenceTransformersInferenceImpl(
        self,
        model_id: str,
        content: str,
-        sampling_params: Optional[SamplingParams] = SamplingParams(),
+        sampling_params: Optional[SamplingParams] = None,
        response_format: Optional[ResponseFormat] = None,
        stream: Optional[bool] = False,
        logprobs: Optional[LogProbConfig] = None,
@ -64,7 +64,7 @@ class SentenceTransformersInferenceImpl(
        self,
        model_id: str,
        messages: List[Message],
-        sampling_params: Optional[SamplingParams] = SamplingParams(),
+        sampling_params: Optional[SamplingParams] = None,
        response_format: Optional[ResponseFormat] = None,
        tools: Optional[List[ToolDefinition]] = None,
        tool_choice: Optional[ToolChoice] = ToolChoice.auto,
--- a/llama_stack/providers/inline/inference/vllm/init.py
+++ b/llama_stack/providers/inline/inference/vllm/init.py
@ -4,12 +4,12 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.

-from typing import Any
+from typing import Any, Dict

 from .config import VLLMConfig


-async def get_provider_impl(config: VLLMConfig, _deps) -> Any:
+async def get_provider_impl(config: VLLMConfig, _deps: Dict[str, Any]):
    from .vllm import VLLMInferenceImpl

    impl = VLLMInferenceImpl(config)
--- a/llama_stack/providers/inline/inference/vllm/config.py
+++ b/llama_stack/providers/inline/inference/vllm/config.py
@ -4,20 +4,21 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.

-from pydantic import BaseModel, Field, field_validator
+from typing import Any, Dict
+
+from pydantic import BaseModel, Field

-from llama_stack.providers.utils.inference import supported_inference_models
 from llama_stack.schema_utils import json_schema_type


@json_schema_type
 class VLLMConfig(BaseModel):
-    """Configuration for the vLLM inference provider."""
+    """Configuration for the vLLM inference provider.
+
+    Note that the model name is no longer part of this static configuration.
+    You can bind an instance of this provider to a specific model with the
+    ``models.register()`` API call."""

-    model: str = Field(
-        default="Llama3.2-3B-Instruct",
-        description="Model descriptor from `llama model list`",
-    )
    tensor_parallel_size: int = Field(
        default=1,
        description="Number of tensor parallel replicas (number of GPUs to use).",
@ -26,32 +27,27 @@ class VLLMConfig(BaseModel):
        default=4096,
        description="Maximum number of tokens to generate.",
    )
+    max_model_len: int = Field(default=4096, description="Maximum context length to use during serving.")
+    max_num_seqs: int = Field(default=4, description="Maximum parallel batch size for generation.")
    enforce_eager: bool = Field(
        default=False,
        description="Whether to use eager mode for inference (otherwise cuda graphs are used).",
    )
    gpu_memory_utilization: float = Field(
        default=0.3,
+        description=(
+            "How much GPU memory will be allocated when this provider has finished "
+            "loading, including memory that was already allocated before loading."
+        ),
    )

    @classmethod
-    def sample_run_config(cls):
+    def sample_run_config(cls, **kwargs: Any) -> Dict[str, Any]:
        return {
-            "model": "${env.INFERENCE_MODEL:Llama3.2-3B-Instruct}",
            "tensor_parallel_size": "${env.TENSOR_PARALLEL_SIZE:1}",
            "max_tokens": "${env.MAX_TOKENS:4096}",
+            "max_model_len": "${env.MAX_MODEL_LEN:4096}",
+            "max_num_seqs": "${env.MAX_NUM_SEQS:4}",
            "enforce_eager": "${env.ENFORCE_EAGER:False}",
-            "gpu_memory_utilization": "${env.GPU_MEMORY_UTILIZATION:0.7}",
+            "gpu_memory_utilization": "${env.GPU_MEMORY_UTILIZATION:0.3}",
        }
-
-    @field_validator("model")
-    @classmethod
-    def validate_model(cls, model: str) -> str:
-        permitted_models = supported_inference_models()
-
-        descriptors = [m.descriptor() for m in permitted_models]
-        repos = [m.huggingface_repo for m in permitted_models]
-        if model not in (descriptors + repos):
-            model_list = "\n\t".join(repos)
-            raise ValueError(f"Unknown model: `{model}`. Choose from [\n\t{model_list}\n]")
-        return model
--- a/llama_stack/providers/inline/inference/vllm/vllm.py
+++ b/llama_stack/providers/inline/inference/vllm/vllm.py
@ -4,45 +4,71 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.

-import logging
-import os
+import json
+import re
 import uuid
-from typing import AsyncGenerator, List, Optional
+from typing import AsyncGenerator, AsyncIterator, Dict, List, Optional, Union

+# These vLLM modules contain names that overlap with Llama Stack names, so we import
+# fully-qualified names
+import vllm.entrypoints.openai.protocol
+import vllm.sampling_params
 from vllm.engine.arg_utils import AsyncEngineArgs
 from vllm.engine.async_llm_engine import AsyncLLMEngine
-from vllm.sampling_params import SamplingParams as VLLMSamplingParams
+from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
+from vllm.entrypoints.openai.serving_models import BaseModelPath, OpenAIServingModels

-from llama_stack.apis.common.content_types import InterleavedContent
+from llama_stack.apis.common.content_types import (
+    InterleavedContent,
+    InterleavedContentItem,
+    TextDelta,
+    ToolCallDelta,
+)
 from llama_stack.apis.inference import (
    ChatCompletionRequest,
    ChatCompletionResponse,
+    ChatCompletionResponseEvent,
+    ChatCompletionResponseEventType,
    ChatCompletionResponseStreamChunk,
+    CompletionMessage,
    CompletionResponse,
    CompletionResponseStreamChunk,
    EmbeddingsResponse,
    EmbeddingTaskType,
+    GrammarResponseFormat,
    Inference,
-    InterleavedContentItem,
+    JsonSchemaResponseFormat,
    LogProbConfig,
    Message,
    ResponseFormat,
    SamplingParams,
    TextTruncation,
+    TokenLogProbs,
    ToolChoice,
    ToolConfig,
-    ToolDefinition,
-    ToolPromptFormat,
 )
 from llama_stack.apis.models import Model
+from llama_stack.log import get_logger
+from llama_stack.models.llama import sku_list
+from llama_stack.models.llama.datatypes import (
+    StopReason,
+    ToolCall,
+    ToolDefinition,
+    ToolPromptFormat,
+    TopKSamplingStrategy,
+    TopPSamplingStrategy,
+)
+from llama_stack.models.llama.llama3.chat_format import ChatFormat
 from llama_stack.models.llama.llama3.tokenizer import Tokenizer
-from llama_stack.models.llama.sku_list import resolve_model
-from llama_stack.providers.datatypes import ModelsProtocolPrivate
+from llama_stack.providers.remote.inference.vllm.vllm import build_hf_repo_model_entries
+from llama_stack.providers.utils.inference.model_registry import (
+    ModelRegistryHelper,
+    ModelsProtocolPrivate,
+)
 from llama_stack.providers.utils.inference.openai_compat import (
    OpenAICompatCompletionChoice,
    OpenAICompatCompletionResponse,
-    get_sampling_options,
-    process_chat_completion_response,
+    get_stop_reason,
    process_chat_completion_stream_response,
 )
 from llama_stack.providers.utils.inference.prompt_adapter import (
@ -50,188 +76,322 @@ from llama_stack.providers.utils.inference.prompt_adapter import (
 )

 from .config import VLLMConfig
+from .openai_utils import llama_stack_chat_completion_to_openai_chat_completion_dict

-log = logging.getLogger(__name__)
+# Map from Hugging Face model architecture name to appropriate tool parser.
+# See vllm.entrypoints.openai.tool_parsers.ToolParserManager.tool_parsers for the full list of
+# available parsers.
+# TODO: Expand this list
+CONFIG_TYPE_TO_TOOL_PARSER = {
+    "GraniteConfig": "granite",
+    "MllamaConfig": "llama3_json",
+    "LlamaConfig": "llama3_json",
+}
+DEFAULT_TOOL_PARSER = "pythonic"


-def _random_uuid() -> str:
+logger = get_logger(__name__, category="inference")
+
+
+def _random_uuid_str() -> str:
    return str(uuid.uuid4().hex)


+def _response_format_to_guided_decoding_params(
+    response_format: Optional[ResponseFormat],  # type: ignore
+) -> vllm.sampling_params.GuidedDecodingParams:
+    """
+    Translate constrained decoding parameters from Llama Stack's format to vLLM's format.
+
+    :param response_format: Llama Stack version of constrained decoding info. Can be ``None``,
+     indicating no constraints.
+    :returns: The equivalent dataclass object for the low-level inference layer of vLLM.
+    """
+    if response_format is None:
+        # As of vLLM 0.6.3, the default constructor for GuidedDecodingParams() returns an invalid
+        # value that crashes the executor on some code paths. Use ``None`` instead.
+        return None
+
+    # Llama Stack currently implements fewer types of constrained decoding than vLLM does.
+    # Translate the types that exist and detect if Llama Stack adds new ones.
+    if isinstance(response_format, JsonSchemaResponseFormat):
+        return vllm.sampling_params.GuidedDecodingParams(json=response_format.json_schema)
+    elif isinstance(response_format, GrammarResponseFormat):
+        # BNF grammar.
+        # Llama Stack uses the parse tree of the grammar, while vLLM uses the string
+        # representation of the grammar.
+        raise TypeError(
+            "Constrained decoding with BNF grammars is not currently implemented, because the "
+            "reference implementation does not implement it."
+        )
+    else:
+        raise TypeError(f"ResponseFormat object is of unexpected subtype '{type(response_format)}'")
+
+
+def _convert_sampling_params(
+    sampling_params: Optional[SamplingParams],
+    response_format: Optional[ResponseFormat],  # type: ignore
+    log_prob_config: Optional[LogProbConfig],
+) -> vllm.SamplingParams:
+    """Convert sampling and constrained decoding configuration from Llama Stack's format to vLLM's
+    format."""
+    # In the absence of provided config values, use Llama Stack defaults as encoded in the Llama
+    # Stack dataclasses. These defaults are different from vLLM's defaults.
+    if sampling_params is None:
+        sampling_params = SamplingParams()
+    if log_prob_config is None:
+        log_prob_config = LogProbConfig()
+
+    if isinstance(sampling_params.strategy, TopKSamplingStrategy):
+        if sampling_params.strategy.top_k == 0:
+            # vLLM treats "k" differently for top-k sampling
+            vllm_top_k = -1
+        else:
+            vllm_top_k = sampling_params.strategy.top_k
+    else:
+        vllm_top_k = -1
+
+    if isinstance(sampling_params.strategy, TopPSamplingStrategy):
+        vllm_top_p = sampling_params.strategy.top_p
+        # Llama Stack only allows temperature with top-P.
+        vllm_temperature = sampling_params.strategy.temperature
+    else:
+        vllm_top_p = 1.0
+        vllm_temperature = 0.0
+
+    # vLLM allows top-p and top-k at the same time.
+    vllm_sampling_params = vllm.SamplingParams.from_optional(
+        max_tokens=(None if sampling_params.max_tokens == 0 else sampling_params.max_tokens),
+        temperature=vllm_temperature,
+        top_p=vllm_top_p,
+        top_k=vllm_top_k,
+        repetition_penalty=sampling_params.repetition_penalty,
+        guided_decoding=_response_format_to_guided_decoding_params(response_format),
+        logprobs=log_prob_config.top_k,
+    )
+    return vllm_sampling_params
+
+
 class VLLMInferenceImpl(Inference, ModelsProtocolPrivate):
-    """Inference implementation for vLLM."""
+    """
+    vLLM-based inference model adapter for Llama Stack with support for multiple models.
+
+    Requires the configuration parameters documented in the :class:`VllmConfig2` class.
+    """
+
+    config: VLLMConfig
+    register_helper: ModelRegistryHelper
+    model_ids: set[str]
+    resolved_model_id: str | None
+    engine: AsyncLLMEngine | None
+    chat: OpenAIServingChat | None
+    is_meta_llama_model: bool

    def __init__(self, config: VLLMConfig):
        self.config = config
+        logger.info(f"Config is: {self.config}")
+
+        self.register_helper = ModelRegistryHelper(build_hf_repo_model_entries())
+        self.formatter = ChatFormat(Tokenizer.get_instance())
+
+        # The following are initialized when paths are bound to this provider
+        self.resolved_model_id = None
+        self.model_ids = set()
        self.engine = None
+        self.chat = None
+        self.is_meta_llama_model = False

-    async def initialize(self):
-        log.info("Initializing vLLM inference provider.")
+    ###########################################################################
+    # METHODS INHERITED FROM IMPLICIT BASE CLASS.
+    # TODO: Make this class inherit from the new base class ProviderBase once that class exists.

-        # Disable usage stats reporting. This would be a surprising thing for most
-        # people to find out was on by default.
-        # https://docs.vllm.ai/en/latest/serving/usage_stats.html
-        if "VLLM_NO_USAGE_STATS" not in os.environ:
-            os.environ["VLLM_NO_USAGE_STATS"] = "1"
+    async def initialize(self) -> None:
+        """
+        Callback that is invoked through many levels of indirection during provider class
+        instantiation, sometime after when __init__() is called and before any model registration
+        methods or methods connected to a REST API are called.

-        model = resolve_model(self.config.model)
-        if model is None:
-            raise ValueError(f"Unknown model {self.config.model}")
+        It's not clear what assumptions the class can make about the platform's initialization
+        state here that can't be made during __init__(), and vLLM can't be started until we know
+        what model it's supposed to be serving, so nothing happens here currently.
+        """
+        pass

-        if model.huggingface_repo is None:
-            raise ValueError(f"Model {self.config.model} needs a huggingface repo")
-
-        # TODO -- there are a ton of options supported here ...
-        engine_args = AsyncEngineArgs(
-            model=model.huggingface_repo,
-            tokenizer=model.huggingface_repo,
-            tensor_parallel_size=self.config.tensor_parallel_size,
-            enforce_eager=self.config.enforce_eager,
-            gpu_memory_utilization=self.config.gpu_memory_utilization,
-            guided_decoding_backend="lm-format-enforcer",
-        )
-
-        self.engine = AsyncLLMEngine.from_engine_args(engine_args)
-
-    async def shutdown(self):
-        """Shut down the vLLM inference adapter."""
-        log.info("Shutting down vLLM inference provider.")
-        if self.engine:
+    async def shutdown(self) -> None:
+        logger.info(f"Shutting down inline vLLM inference provider {self}.")
+        if self.engine is not None:
            self.engine.shutdown_background_loop()
+            self.engine = None
+            self.chat = None
+            self.model_ids = set()
+            self.resolved_model_id = None
+
+    ###########################################################################
+    # METHODS INHERITED FROM ModelsProtocolPrivate INTERFACE

    # Note that the return type of the superclass method is WRONG
    async def register_model(self, model: Model) -> Model:
        """
-        Callback that is called when the server associates an inference endpoint
-        with an inference provider.
+        Callback that is called when the server associates an inference endpoint with an
+        inference provider.

-        :param model: Object that encapsulates parameters necessary for identifying
-         a specific LLM.
+        :param model: Object that encapsulates parameters necessary for identifying a specific
+         LLM.

-        :returns: The input ``Model`` object. It may or may not be permissible
-         to change fields before returning this object.
+        :returns: The input ``Model`` object. It may or may not be permissible to change fields
+         before returning this object.
        """
-        log.info(f"Registering model {model.identifier} with vLLM inference provider.")
-        # The current version of this provided is hard-coded to serve only
-        # the model specified in the YAML config file.
-        configured_model = resolve_model(self.config.model)
-        registered_model = resolve_model(model.model_id)
+        logger.debug(f"In register_model({model})")
+
+        # First attempt to interpret the model coordinates as a Llama model name
+        resolved_llama_model = sku_list.resolve_model(model.provider_model_id)
+        if resolved_llama_model is not None:
+            # Load from Hugging Face repo into default local cache dir
+            model_id_for_vllm = resolved_llama_model.huggingface_repo
+
+            # Detect a genuine Meta Llama model to trigger Meta-specific preprocessing.
+            # Don't set self.is_meta_llama_model until we actually load the model.
+            is_meta_llama_model = True
+        else:  # if resolved_llama_model is None
+            # Not a Llama model name. Pass the model id through to vLLM's loader
+            model_id_for_vllm = model.provider_model_id
+            is_meta_llama_model = False
+
+        if self.resolved_model_id is not None:
+            if model_id_for_vllm != self.resolved_model_id:
+                raise ValueError(
+                    f"Attempted to serve two LLMs (ids '{self.resolved_model_id}') and "
+                    f"'{model_id_for_vllm}') from one copy of provider '{self}'. Use multiple "
+                    f"copies of the provider instead."
+                )
+            else:
+                # Model already loaded
+                logger.info(
+                    f"Requested id {model} resolves to {model_id_for_vllm}, which is already loaded. Continuing."
+                )
+                self.model_ids.add(model.model_id)
+                return model
+
+        logger.info(f"Requested id {model} resolves to {model_id_for_vllm}. Loading {model_id_for_vllm}.")
+        if is_meta_llama_model:
+            logger.info(f"Model {model_id_for_vllm} is a Meta Llama model.")
+        self.is_meta_llama_model = is_meta_llama_model
+
+        # If we get here, this is the first time registering a model.
+        # Preload so that the first inference request won't time out.
+        engine_args = AsyncEngineArgs(
+            model=model_id_for_vllm,
+            tokenizer=model_id_for_vllm,
+            tensor_parallel_size=self.config.tensor_parallel_size,
+            enforce_eager=self.config.enforce_eager,
+            gpu_memory_utilization=self.config.gpu_memory_utilization,
+            max_num_seqs=self.config.max_num_seqs,
+            max_model_len=self.config.max_model_len,
+        )
+        self.engine = AsyncLLMEngine.from_engine_args(engine_args)
+
+        # vLLM currently requires the user to specify the tool parser manually. To choose a tool
+        # parser, we need to determine what model architecture is being used. For now, we infer
+        # that information from what config class the model uses.
+        low_level_model_config = self.engine.engine.get_model_config()
+        hf_config = low_level_model_config.hf_config
+        hf_config_class_name = hf_config.__class__.__name__
+        if hf_config_class_name in CONFIG_TYPE_TO_TOOL_PARSER:
+            tool_parser = CONFIG_TYPE_TO_TOOL_PARSER[hf_config_class_name]
+        else:
+            # No info -- choose a default so we can at least attempt tool
+            # use.
+            tool_parser = DEFAULT_TOOL_PARSER
+        logger.debug(f"{hf_config_class_name=}")
+        logger.debug(f"{tool_parser=}")
+
+        # Wrap the lower-level engine in an OpenAI-compatible chat API
+        model_config = await self.engine.get_model_config()
+        self.chat = OpenAIServingChat(
+            engine_client=self.engine,
+            model_config=model_config,
+            models=OpenAIServingModels(
+                engine_client=self.engine,
+                model_config=model_config,
+                base_model_paths=[
+                    # The layer below us will only see resolved model IDs
+                    BaseModelPath(model_id_for_vllm, model_id_for_vllm)
+                ],
+            ),
+            response_role="assistant",
+            request_logger=None,  # Use default logging
+            chat_template=None,  # Use default template from model checkpoint
+            enable_auto_tools=True,
+            tool_parser=tool_parser,
+            chat_template_content_format="auto",
+        )
+        self.resolved_model_id = model_id_for_vllm
+        self.model_ids.add(model.model_id)
+
+        logger.info(f"Finished preloading model: {model_id_for_vllm}")

-        if configured_model.core_model_id != registered_model.core_model_id:
-            raise ValueError(
-                f"Requested model '{model.identifier}' is different from "
-                f"model '{self.config.model}' that this provider "
-                f"is configured to serve"
-            )
        return model

-    def _sampling_params(self, sampling_params: SamplingParams) -> VLLMSamplingParams:
-        if sampling_params is None:
-            return VLLMSamplingParams(max_tokens=self.config.max_tokens)
-
-        options = get_sampling_options(sampling_params)
-        if "repeat_penalty" in options:
-            options["repetition_penalty"] = options["repeat_penalty"]
-            del options["repeat_penalty"]
-
-        return VLLMSamplingParams(**options)
-
    async def unregister_model(self, model_id: str) -> None:
-        pass
+        """
+        Callback that is called when the server removes an inference endpoint from an inference
+        provider.
+
+        :param model_id: The same external ID that the higher layers of the stack previously passed
+        to :func:`register_model()`
+        """
+        if model_id not in self.model_ids:
+            raise ValueError(
+                f"Attempted to unregister model ID '{model_id}', but that ID is not registered to this provider."
+            )
+        self.model_ids.remove(model_id)
+
+        if len(self.model_ids) == 0:
+            # Last model was just unregistered. Shut down the connection to vLLM and free up
+            # resources.
+            # Note that this operation may cause in-flight chat completion requests on the
+            # now-unregistered model to return errors.
+            self.resolved_model_id = None
+            self.chat = None
+            self.engine.shutdown_background_loop()
+            self.engine = None
+
+    ###########################################################################
+    # METHODS INHERITED FROM Inference INTERFACE

    async def completion(
        self,
        model_id: str,
        content: InterleavedContent,
-        sampling_params: Optional[SamplingParams] = SamplingParams(),
+        sampling_params: Optional[SamplingParams] = None,
        response_format: Optional[ResponseFormat] = None,
        stream: Optional[bool] = False,
        logprobs: Optional[LogProbConfig] = None,
-    ) -> CompletionResponse | CompletionResponseStreamChunk:
-        raise NotImplementedError("Completion not implemented for vLLM")
+    ) -> Union[CompletionResponse, AsyncIterator[CompletionResponseStreamChunk]]:
+        if model_id not in self.model_ids:
+            raise ValueError(
+                f"This adapter is not registered to model id '{model_id}'. Registered IDs are: {self.model_ids}"
+            )
+        if not isinstance(content, str):
+            raise NotImplementedError("Multimodal input not currently supported")
+        if sampling_params is None:
+            sampling_params = SamplingParams()

-    async def chat_completion(
-        self,
-        model_id: str,
-        messages: List[Message],
-        sampling_params: Optional[SamplingParams] = SamplingParams(),
-        tools: Optional[List[ToolDefinition]] = None,
-        tool_choice: Optional[ToolChoice] = ToolChoice.auto,
-        tool_prompt_format: Optional[ToolPromptFormat] = None,
-        response_format: Optional[ResponseFormat] = None,
-        stream: Optional[bool] = False,
-        logprobs: Optional[LogProbConfig] = None,
-        tool_config: Optional[ToolConfig] = None,
-    ) -> ChatCompletionResponse | ChatCompletionResponseStreamChunk:
-        assert self.engine is not None
+        converted_sampling_params = _convert_sampling_params(sampling_params, response_format, logprobs)

-        request = ChatCompletionRequest(
-            model=model_id,
-            messages=messages,
-            sampling_params=sampling_params,
-            tools=tools or [],
-            stream=stream,
-            logprobs=logprobs,
-            tool_config=tool_config,
-        )
+        logger.debug(f"{converted_sampling_params=}")

-        log.info("Sampling params: %s", sampling_params)
-        request_id = _random_uuid()
-
-        prompt = await chat_completion_request_to_prompt(request, self.config.model)
-        vllm_sampling_params = self._sampling_params(request.sampling_params)
-        results_generator = self.engine.generate(prompt, vllm_sampling_params, request_id)
        if stream:
-            return self._stream_chat_completion(request, results_generator)
+            return self._streaming_completion(content, converted_sampling_params)
        else:
-            return await self._nonstream_chat_completion(request, results_generator)
-
-    async def _nonstream_chat_completion(
-        self, request: ChatCompletionRequest, results_generator: AsyncGenerator
-    ) -> ChatCompletionResponse:
-        outputs = [o async for o in results_generator]
-        final_output = outputs[-1]
-
-        assert final_output is not None
-        outputs = final_output.outputs
-        finish_reason = outputs[-1].stop_reason
-        choice = OpenAICompatCompletionChoice(
-            finish_reason=finish_reason,
-            text="".join([output.text for output in outputs]),
-        )
-        response = OpenAICompatCompletionResponse(
-            choices=[choice],
-        )
-        return process_chat_completion_response(response, request)
-
-    async def _stream_chat_completion(
-        self, request: ChatCompletionRequest, results_generator: AsyncGenerator
-    ) -> AsyncGenerator:
-        tokenizer = Tokenizer.get_instance()
-
-        async def _generate_and_convert_to_openai_compat():
-            cur = []
-            async for chunk in results_generator:
-                if not chunk.outputs:
-                    log.warning("Empty chunk received")
-                    continue
-
-                output = chunk.outputs[-1]
-
-                new_tokens = output.token_ids[len(cur) :]
-                text = tokenizer.decode(new_tokens)
-                cur.extend(new_tokens)
-                choice = OpenAICompatCompletionChoice(
-                    finish_reason=output.finish_reason,
-                    text=text,
-                )
-                yield OpenAICompatCompletionResponse(
-                    choices=[choice],
-                )
-
-        stream = _generate_and_convert_to_openai_compat()
-        async for chunk in process_chat_completion_stream_response(stream, request):
-            yield chunk
+            streaming_result = None
+            async for _ in self._streaming_completion(content, converted_sampling_params):
+                pass
+            return CompletionResponse(
+                content=streaming_result.delta,
+                stop_reason=streaming_result.stop_reason,
+                logprobs=streaming_result.logprobs,
+            )

    async def embeddings(
        self,
@ -242,3 +402,391 @@ class VLLMInferenceImpl(Inference, ModelsProtocolPrivate):
        task_type: Optional[EmbeddingTaskType] = None,
    ) -> EmbeddingsResponse:
        raise NotImplementedError()
+
+    async def chat_completion(
+        self,
+        model_id: str,
+        messages: List[Message],  # type: ignore
+        sampling_params: Optional[SamplingParams] = None,
+        response_format: Optional[ResponseFormat] = None,  # type: ignore
+        tools: Optional[List[ToolDefinition]] = None,
+        tool_choice: Optional[ToolChoice] = ToolChoice.auto,
+        tool_prompt_format: Optional[ToolPromptFormat] = None,
+        stream: Optional[bool] = False,
+        logprobs: Optional[LogProbConfig] = None,
+        tool_config: Optional[ToolConfig] = None,
+    ) -> ChatCompletionResponse | ChatCompletionResponseStreamChunk:
+        sampling_params = sampling_params or SamplingParams()
+        if model_id not in self.model_ids:
+            raise ValueError(
+                f"This adapter is not registered to model id '{model_id}'. Registered IDs are: {self.model_ids}"
+            )
+
+        # Convert to Llama Stack internal format for consistency
+        request = ChatCompletionRequest(
+            model=self.resolved_model_id,
+            messages=messages,
+            sampling_params=sampling_params,
+            response_format=response_format,
+            tools=tools,
+            tool_choice=tool_choice,
+            tool_prompt_format=tool_prompt_format,
+            stream=stream,
+            logprobs=logprobs,
+        )
+
+        if self.is_meta_llama_model:
+            # Bypass vLLM chat templating layer for Meta Llama models, because the
+            # templating layer in Llama Stack currently produces better results.
+            logger.debug(
+                f"Routing {self.resolved_model_id} chat completion through "
+                f"Llama Stack's templating layer instead of vLLM's."
+            )
+            return await self._chat_completion_for_meta_llama(request)
+
+        logger.debug(f"{self.resolved_model_id} is not a Meta Llama model")
+
+        # Arguments to the vLLM call must be packaged as a ChatCompletionRequest dataclass.
+        # Note that this dataclass has the same name as a similar dataclass in Llama Stack.
+        request_options = await llama_stack_chat_completion_to_openai_chat_completion_dict(request)
+        chat_completion_request = vllm.entrypoints.openai.protocol.ChatCompletionRequest(**request_options)
+
+        logger.debug(f"Converted request: {chat_completion_request}")
+
+        vllm_result = await self.chat.create_chat_completion(chat_completion_request)
+        logger.debug(f"Result from vLLM: {vllm_result}")
+        if isinstance(vllm_result, vllm.entrypoints.openai.protocol.ErrorResponse):
+            raise ValueError(f"Error from vLLM layer: {vllm_result}")
+
+        # Return type depends on "stream" argument
+        if stream:
+            if not isinstance(vllm_result, AsyncGenerator):
+                raise TypeError(f"Unexpected result type {type(vllm_result)} for streaming inference call")
+            # vLLM client returns a stream of strings, which need to be parsed.
+            # Stream comes in the form of an async generator.
+            return self._convert_streaming_results(vllm_result)
+        else:
+            if not isinstance(vllm_result, vllm.entrypoints.openai.protocol.ChatCompletionResponse):
+                raise TypeError(f"Unexpected result type {type(vllm_result)} for non-streaming inference call")
+            return self._convert_non_streaming_results(vllm_result)
+
+    ###########################################################################
+    # INTERNAL METHODS
+
+    async def _streaming_completion(
+        self, content: str, sampling_params: vllm.SamplingParams
+    ) -> AsyncIterator[CompletionResponseStreamChunk]:
+        """Internal implementation of :func:`completion()` API for the streaming case. Assumes
+        that arguments have been validated upstream.
+
+        :param content: Must be a string
+        :param sampling_params: Paramters from  public API's ``response_format``
+         and ``sampling_params`` arguments, converted to VLLM format
+        """
+        # We run agains the vLLM generate() call directly instead of using the OpenAI-compatible
+        # layer, because doing so simplifies the code here.
+
+        # The vLLM engine requires a unique identifier for each call to generate()
+        request_id = _random_uuid_str()
+
+        # The vLLM generate() API is streaming-only and returns an async generator.
+        # The generator returns objects of type vllm.RequestOutput.
+        results_generator = self.engine.generate(content, sampling_params, request_id)
+
+        # Need to know the model's EOS token ID for the conversion code below.
+        # AsyncLLMEngine is a wrapper around LLMEngine, and the tokenizer is only available if
+        # we drill down to the LLMEngine inside the AsyncLLMEngine.
+        # Similarly, the tokenizer in an LLMEngine is a wrapper around a BaseTokenizerGroup,
+        # and we need to drill down to the Hugging Face tokenizer inside the BaseTokenizerGroup.
+        llm_engine = self.engine.engine
+        tokenizer_group = llm_engine.tokenizer
+        eos_token_id = tokenizer_group.tokenizer.eos_token_id
+
+        request_output: vllm.RequestOutput = None
+        async for request_output in results_generator:
+            # Check for weird inference failures
+            if request_output.outputs is None or len(request_output.outputs) == 0:
+                # This case also should never happen
+                raise ValueError("Inference produced empty result")
+
+            # If we get here, then request_output contains the final output of the generate() call.
+            # The result may include multiple alternate outputs, but Llama Stack APIs only allow
+            # us to return one.
+            output: vllm.CompletionOutput = request_output.outputs[0]
+            completion_string = output.text
+
+            # Convert logprobs from vLLM's format to Llama Stack's format
+            logprobs = [
+                TokenLogProbs(logprobs_by_token={v.decoded_token: v.logprob for _, v in logprob_dict.items()})
+                for logprob_dict in output.logprobs
+            ]
+
+            # The final output chunk should be labeled with the reason that the overall generate()
+            # call completed.
+            logger.debug(f"{output.stop_reason=}; {type(output.stop_reason)=}")
+            if output.stop_reason is None:
+                stop_reason = None  # Still going
+            elif output.stop_reason == "stop":
+                stop_reason = StopReason.end_of_turn
+            elif output.stop_reason == "length":
+                stop_reason = StopReason.out_of_tokens
+            elif isinstance(output.stop_reason, int):
+                # If the model config specifies multiple end-of-sequence tokens, then vLLM
+                # will return the token ID of the EOS token in the stop_reason field.
+                stop_reason = StopReason.end_of_turn
+            else:
+                raise ValueError(f"Unrecognized stop reason '{output.stop_reason}'")
+
+            # vLLM's protocol outputs the stop token, then sets end of message on the next step for
+            # some reason.
+            if request_output.outputs[-1].token_ids[-1] == eos_token_id:
+                stop_reason = StopReason.end_of_message
+
+            yield CompletionResponseStreamChunk(delta=completion_string, stop_reason=stop_reason, logprobs=logprobs)
+
+        # Llama Stack requires that the last chunk have a stop reason, but vLLM doesn't always
+        # provide one if it runs out of tokens.
+        if stop_reason is None:
+            yield CompletionResponseStreamChunk(
+                delta=completion_string,
+                stop_reason=StopReason.out_of_tokens,
+                logprobs=logprobs,
+            )
+
+    def _convert_non_streaming_results(
+        self, vllm_result: vllm.entrypoints.openai.protocol.ChatCompletionResponse
+    ) -> ChatCompletionResponse:
+        """
+        Subroutine to convert the non-streaming output of vLLM's OpenAI-compatible API into an
+        equivalent Llama Stack object.
+
+        The result from vLLM's non-streaming API is a dataclass with the same name as the Llama
+        Stack ChatCompletionResponse dataclass, but with more and different field names. We ignore
+        the fields that aren't currently present in the Llama Stack dataclass.
+        """
+
+        # There may be multiple responses, but we can only pass through the first one.
+        if len(vllm_result.choices) == 0:
+            raise ValueError("Don't know how to convert response object without any responses")
+        vllm_message = vllm_result.choices[0].message
+        vllm_finish_reason = vllm_result.choices[0].finish_reason
+
+        converted_message = CompletionMessage(
+            role=vllm_message.role,
+            # Llama Stack API won't accept None for content field.
+            content=("" if vllm_message.content is None else vllm_message.content),
+            stop_reason=get_stop_reason(vllm_finish_reason),
+            tool_calls=[
+                ToolCall(
+                    call_id=t.id,
+                    tool_name=t.function.name,
+                    # vLLM function args come back as a string. Llama Stack expects JSON.
+                    arguments=json.loads(t.function.arguments),
+                )
+                for t in vllm_message.tool_calls
+            ],
+        )
+
+        # TODO: Convert logprobs
+
+        logger.debug(f"Converted message: {converted_message}")
+
+        return ChatCompletionResponse(
+            completion_message=converted_message,
+        )
+
+    async def _chat_completion_for_meta_llama(
+        self, request: ChatCompletionRequest
+    ) -> Union[ChatCompletionResponse, AsyncIterator[ChatCompletionResponseStreamChunk]]:
+        """
+        Subroutine that routes chat completions for Meta Llama models through Llama Stack's
+        chat template instead of using vLLM's version of that template. The Llama Stack version
+        of the chat template currently produces more reliable outputs.
+
+        Once vLLM's support for Meta Llama models has matured more, we should consider routing
+        Meta Llama requests through the vLLM chat completions API instead of using this method.
+        """
+        formatter = ChatFormat(Tokenizer.get_instance())
+
+        # Note that this function call modifies `request` in place.
+        prompt = await chat_completion_request_to_prompt(request, self.resolved_model_id)
+
+        model_id = list(self.model_ids)[0]  # Any model ID will do here
+        completion_response_or_iterator = await self.completion(
+            model_id=model_id,
+            content=prompt,
+            sampling_params=request.sampling_params,
+            response_format=request.response_format,
+            stream=request.stream,
+            logprobs=request.logprobs,
+        )
+
+        if request.stream:
+            if not isinstance(completion_response_or_iterator, AsyncIterator):
+                raise TypeError(
+                    f"Received unexpected result type {type(completion_response_or_iterator)}for streaming request."
+                )
+            return self._chat_completion_for_meta_llama_streaming(completion_response_or_iterator, request)
+
+        # elsif not request.stream:
+        if not isinstance(completion_response_or_iterator, CompletionResponse):
+            raise TypeError(
+                f"Received unexpected result type {type(completion_response_or_iterator)}for non-streaming request."
+            )
+        completion_response: CompletionResponse = completion_response_or_iterator
+        raw_message = formatter.decode_assistant_message_from_content(
+            completion_response.content, completion_response.stop_reason
+        )
+        return ChatCompletionResponse(
+            completion_message=CompletionMessage(
+                content=raw_message.content,
+                stop_reason=raw_message.stop_reason,
+                tool_calls=raw_message.tool_calls,
+            ),
+            logprobs=completion_response.logprobs,
+        )
+
+    async def _chat_completion_for_meta_llama_streaming(
+        self, results_iterator: AsyncIterator, request: ChatCompletionRequest
+    ) -> AsyncIterator:
+        """
+        Code from :func:`_chat_completion_for_meta_llama()` that needs to be a separate
+        method to keep asyncio happy.
+        """
+
+        # Convert to OpenAI format, then use shared code to convert to Llama Stack format.
+        async def _generate_and_convert_to_openai_compat():
+            chunk: CompletionResponseStreamChunk  # Make Pylance happy
+            last_text_len = 0
+            async for chunk in results_iterator:
+                if chunk.stop_reason == StopReason.end_of_turn:
+                    finish_reason = "stop"
+                elif chunk.stop_reason == StopReason.end_of_message:
+                    finish_reason = "eos"
+                elif chunk.stop_reason == StopReason.out_of_tokens:
+                    finish_reason = "length"
+                else:
+                    finish_reason = None
+
+                # Convert delta back to an actual delta
+                text_delta = chunk.delta[last_text_len:]
+                last_text_len = len(chunk.delta)
+
+                logger.debug(f"{text_delta=}; {finish_reason=}")
+
+                yield OpenAICompatCompletionResponse(
+                    choices=[OpenAICompatCompletionChoice(finish_reason=finish_reason, text=text_delta)]
+                )
+
+        stream = _generate_and_convert_to_openai_compat()
+        async for chunk in process_chat_completion_stream_response(stream, request):
+            logger.debug(f"Returning chunk: {chunk}")
+            yield chunk
+
+    async def _convert_streaming_results(self, vllm_result: AsyncIterator) -> AsyncIterator:
+        """
+        Subroutine that wraps the streaming outputs of vLLM's OpenAI-compatible
+        API into a second async iterator that returns Llama Stack objects.
+
+        :param vllm_result: Stream of strings that need to be parsed
+        """
+        # Tool calls come in pieces, but Llama Stack expects them in bigger chunks. We build up
+        # those chunks and output them at the end.
+        # This data structure holds the current set of partial tool calls.
+        index_to_tool_call: Dict[int, Dict] = dict()
+
+        # The Llama Stack event stream must always start with a start event. Use an empty one to
+        # simplify logic below
+        yield ChatCompletionResponseStreamChunk(
+            event=ChatCompletionResponseEvent(
+                event_type=ChatCompletionResponseEventType.start,
+                delta=TextDelta(text=""),
+                stop_reason=None,
+            )
+        )
+
+        converted_stop_reason = None
+        async for chunk_str in vllm_result:
+            # Due to OpenAI compatibility, each event in the stream will start with "data: " and
+            # end with "\n\n".
+            _prefix = "data: "
+            _suffix = "\n\n"
+            if not chunk_str.startswith(_prefix) or not chunk_str.endswith(_suffix):
+                raise ValueError(f"Can't parse result string from vLLM: '{re.escape(chunk_str)}'")
+
+            # In between the "data: " and newlines is an event record
+            data_str = chunk_str[len(_prefix) : -len(_suffix)]
+
+            # The end of the stream is indicated with "[DONE]"
+            if data_str == "[DONE]":
+                yield ChatCompletionResponseStreamChunk(
+                    event=ChatCompletionResponseEvent(
+                        event_type=ChatCompletionResponseEventType.complete,
+                        delta=TextDelta(text=""),
+                        stop_reason=converted_stop_reason,
+                    )
+                )
+                return
+
+            # Anything that is not "[DONE]" should be a JSON record
+            parsed_chunk = json.loads(data_str)
+
+            logger.debug(f"Parsed JSON event to:\n{json.dumps(parsed_chunk, indent=2)}")
+
+            # The result may contain multiple completions, but Llama Stack APIs only support
+            # returning one.
+            first_choice = parsed_chunk["choices"][0]
+            converted_stop_reason = get_stop_reason(first_choice["finish_reason"])
+            delta_record = first_choice["delta"]
+
+            if "content" in delta_record:
+                # Text delta
+                yield ChatCompletionResponseStreamChunk(
+                    event=ChatCompletionResponseEvent(
+                        event_type=ChatCompletionResponseEventType.progress,
+                        delta=TextDelta(text=delta_record["content"]),
+                        stop_reason=converted_stop_reason,
+                    )
+                )
+            elif "tool_calls" in delta_record:
+                # Tool call(s). Llama Stack APIs do not have a clear way to return partial tool
+                # calls, so buffer until we get a "tool calls" stop reason
+                for tc in delta_record["tool_calls"]:
+                    index = tc["index"]
+                    if index not in index_to_tool_call:
+                        # First time this tool call is showing up
+                        index_to_tool_call[index] = dict()
+                    tool_call = index_to_tool_call[index]
+                    if "id" in tc:
+                        tool_call["call_id"] = tc["id"]
+                    if "function" in tc:
+                        if "name" in tc["function"]:
+                            tool_call["tool_name"] = tc["function"]["name"]
+                        if "arguments" in tc["function"]:
+                            # Arguments comes in as pieces of a string
+                            if "arguments_str" not in tool_call:
+                                tool_call["arguments_str"] = ""
+                            tool_call["arguments_str"] += tc["function"]["arguments"]
+            else:
+                raise ValueError(f"Don't know how to parse event delta: {delta_record}")
+
+            if first_choice["finish_reason"] == "tool_calls":
+                # Special OpenAI code for "tool calls complete".
+                # Output the buffered tool calls. Llama Stack requires a separate event per tool
+                # call.
+                for tool_call_record in index_to_tool_call.values():
+                    # Arguments come in as a string. Parse the completed string.
+                    tool_call_record["arguments"] = json.loads(tool_call_record["arguments_str"])
+                    del tool_call_record["arguments_str"]
+
+                    yield ChatCompletionResponseStreamChunk(
+                        event=ChatCompletionResponseEvent(
+                            event_type=ChatCompletionResponseEventType.progress,
+                            delta=ToolCallDelta(tool_call=tool_call_record, parse_status="succeeded"),
+                            stop_reason=converted_stop_reason,
+                        )
+                    )
+
+        # If we get here, we've lost the connection with the vLLM event stream before it ended
+        # normally.
+        raise ValueError("vLLM event stream ended without [DONE] message.")
--- a/llama_stack/providers/inline/post_training/torchtune/init.py
+++ b/llama_stack/providers/inline/post_training/torchtune/init.py
@ -4,9 +4,9 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.

-from typing import Dict
+from typing import Any, Dict

-from llama_stack.distribution.datatypes import Api, ProviderSpec
+from llama_stack.distribution.datatypes import Api

 from .config import TorchtunePostTrainingConfig

@ -15,7 +15,7 @@ from .config import TorchtunePostTrainingConfig

 async def get_provider_impl(
    config: TorchtunePostTrainingConfig,
-    deps: Dict[Api, ProviderSpec],
+    deps: Dict[Api, Any],
 ):
    from .post_training import TorchtunePostTrainingImpl

--- a/llama_stack/providers/inline/post_training/torchtune/config.py
+++ b/llama_stack/providers/inline/post_training/torchtune/config.py
@ -4,7 +4,7 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.

-from typing import Literal, Optional
+from typing import Any, Dict, Literal, Optional

 from pydantic import BaseModel

@ -12,3 +12,9 @@ from pydantic import BaseModel
 class TorchtunePostTrainingConfig(BaseModel):
    torch_seed: Optional[int] = None
    checkpoint_format: Optional[Literal["meta", "huggingface"]] = "meta"
+
+    @classmethod
+    def sample_run_config(cls, __distro_dir__: str, **kwargs: Any) -> Dict[str, Any]:
+        return {
+            "checkpoint_format": "meta",
+        }
--- a/llama_stack/providers/inline/post_training/torchtune/post_training.py
+++ b/llama_stack/providers/inline/post_training/torchtune/post_training.py
@ -43,6 +43,9 @@ class TorchtunePostTrainingImpl:
        self.jobs = {}
        self.checkpoints_dict = {}

+    async def shutdown(self):
+        pass
+
    async def supervised_fine_tune(
        self,
        job_uuid: str,
--- a/llama_stack/providers/inline/safety/code_scanner/init.py
+++ b/llama_stack/providers/inline/safety/code_scanner/init.py
@ -4,10 +4,12 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.

+from typing import Any, Dict
+
 from .config import CodeScannerConfig


-async def get_provider_impl(config: CodeScannerConfig, deps):
+async def get_provider_impl(config: CodeScannerConfig, deps: Dict[str, Any]):
    from .code_scanner import MetaReferenceCodeScannerSafetyImpl

    impl = MetaReferenceCodeScannerSafetyImpl(config, deps)
--- a/llama_stack/providers/inline/safety/code_scanner/config.py
+++ b/llama_stack/providers/inline/safety/code_scanner/config.py
@ -4,8 +4,12 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.

+from typing import Any, Dict
+
 from pydantic import BaseModel


 class CodeScannerConfig(BaseModel):
-    pass
+    @classmethod
+    def sample_run_config(cls, __distro_dir__: str, **kwargs: Any) -> Dict[str, Any]:
+        return {}
--- a/llama_stack/providers/inline/safety/llama_guard/init.py
+++ b/llama_stack/providers/inline/safety/llama_guard/init.py
@ -4,10 +4,12 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.

+from typing import Any, Dict
+
 from .config import LlamaGuardConfig


-async def get_provider_impl(config: LlamaGuardConfig, deps):
+async def get_provider_impl(config: LlamaGuardConfig, deps: Dict[str, Any]):
    from .llama_guard import LlamaGuardSafetyImpl

    assert isinstance(config, LlamaGuardConfig), f"Unexpected config type: {type(config)}"
--- a/llama_stack/providers/inline/safety/llama_guard/config.py
+++ b/llama_stack/providers/inline/safety/llama_guard/config.py
@ -4,10 +4,16 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.

-from typing import List
+from typing import Any, Dict, List

 from pydantic import BaseModel


 class LlamaGuardConfig(BaseModel):
    excluded_categories: List[str] = []
+
+    @classmethod
+    def sample_run_config(cls, __distro_dir__: str, **kwargs: Any) -> Dict[str, Any]:
+        return {
+            "excluded_categories": [],
+        }
--- a/llama_stack/providers/inline/safety/prompt_guard/init.py
+++ b/llama_stack/providers/inline/safety/prompt_guard/init.py
@ -4,10 +4,12 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.

+from typing import Any, Dict
+
 from .config import PromptGuardConfig  # noqa: F401


-async def get_provider_impl(config: PromptGuardConfig, deps):
+async def get_provider_impl(config: PromptGuardConfig, deps: Dict[str, Any]):
    from .prompt_guard import PromptGuardSafetyImpl

    impl = PromptGuardSafetyImpl(config, deps)
--- a/llama_stack/providers/inline/safety/prompt_guard/config.py
+++ b/llama_stack/providers/inline/safety/prompt_guard/config.py
@ -5,6 +5,7 @@
 # the root directory of this source tree.

 from enum import Enum
+from typing import Any, Dict

 from pydantic import BaseModel, field_validator

@ -23,3 +24,9 @@ class PromptGuardConfig(BaseModel):
        if v not in [t.value for t in PromptGuardType]:
            raise ValueError(f"Unknown prompt guard type: {v}")
        return v
+
+    @classmethod
+    def sample_run_config(cls, __distro_dir__: str, **kwargs: Any) -> Dict[str, Any]:
+        return {
+            "guard_type": "injection",
+        }
--- a/llama_stack/providers/inline/scoring/basic/init.py
+++ b/llama_stack/providers/inline/scoring/basic/init.py
@ -3,16 +3,16 @@
 #
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
-from typing import Dict
+from typing import Any, Dict

-from llama_stack.distribution.datatypes import Api, ProviderSpec
+from llama_stack.distribution.datatypes import Api

 from .config import BasicScoringConfig


 async def get_provider_impl(
    config: BasicScoringConfig,
-    deps: Dict[Api, ProviderSpec],
+    deps: Dict[Api, Any],
 ):
    from .scoring import BasicScoringImpl

--- a/llama_stack/providers/inline/scoring/basic/config.py
+++ b/llama_stack/providers/inline/scoring/basic/config.py
@ -3,7 +3,12 @@
 #
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
+from typing import Any, Dict
+
 from pydantic import BaseModel


-class BasicScoringConfig(BaseModel): ...
+class BasicScoringConfig(BaseModel):
+    @classmethod
+    def sample_run_config(cls, __distro_dir__: str, **kwargs: Any) -> Dict[str, Any]:
+        return {}
--- a/llama_stack/providers/inline/scoring/basic/scoring.py
+++ b/llama_stack/providers/inline/scoring/basic/scoring.py
@ -23,10 +23,11 @@ from llama_stack.providers.utils.common.data_schema_validator import (

 from .config import BasicScoringConfig
 from .scoring_fn.equality_scoring_fn import EqualityScoringFn
+from .scoring_fn.regex_parser_math_response_scoring_fn import RegexParserMathResponseScoringFn
 from .scoring_fn.regex_parser_scoring_fn import RegexParserScoringFn
 from .scoring_fn.subset_of_scoring_fn import SubsetOfScoringFn

-FIXED_FNS = [EqualityScoringFn, SubsetOfScoringFn, RegexParserScoringFn]
+FIXED_FNS = [EqualityScoringFn, SubsetOfScoringFn, RegexParserScoringFn, RegexParserMathResponseScoringFn]


 class BasicScoringImpl(
--- a/llama_stack/providers/inline/scoring/basic/scoring_fn/fn_defs/regex_parser_multiple_choice_answer.py
+++ b/llama_stack/providers/inline/scoring/basic/scoring_fn/fn_defs/regex_parser_multiple_choice_answer.py
@ -12,6 +12,7 @@ from llama_stack.apis.scoring_functions import (
 )

 MULTILINGUAL_ANSWER_REGEXES = [
+    r"The best answer is ",
    r"Answer\s*:",
    r"Answer\s*:",  # Korean invisible character
    r"উত্তর\s*:",
--- a/llama_stack/providers/inline/scoring/braintrust/init.py
+++ b/llama_stack/providers/inline/scoring/braintrust/init.py
@ -3,11 +3,11 @@
 #
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
-from typing import Dict
+from typing import Any, Dict

 from pydantic import BaseModel

-from llama_stack.distribution.datatypes import Api, ProviderSpec
+from llama_stack.distribution.datatypes import Api

 from .config import BraintrustScoringConfig

@ -18,7 +18,7 @@ class BraintrustProviderDataValidator(BaseModel):

 async def get_provider_impl(
    config: BraintrustScoringConfig,
-    deps: Dict[Api, ProviderSpec],
+    deps: Dict[Api, Any],
 ):
    from .braintrust import BraintrustScoringImpl

--- a/llama_stack/providers/inline/scoring/llm_as_judge/init.py
+++ b/llama_stack/providers/inline/scoring/llm_as_judge/init.py
@ -3,16 +3,16 @@
 #
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
-from typing import Dict
+from typing import Any, Dict

-from llama_stack.distribution.datatypes import Api, ProviderSpec
+from llama_stack.distribution.datatypes import Api

 from .config import LlmAsJudgeScoringConfig


 async def get_provider_impl(
    config: LlmAsJudgeScoringConfig,
-    deps: Dict[Api, ProviderSpec],
+    deps: Dict[Api, Any],
 ):
    from .scoring import LlmAsJudgeScoringImpl

--- a/llama_stack/providers/inline/scoring/llm_as_judge/config.py
+++ b/llama_stack/providers/inline/scoring/llm_as_judge/config.py
@ -3,7 +3,12 @@
 #
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
+from typing import Any, Dict
+
 from pydantic import BaseModel


-class LlmAsJudgeScoringConfig(BaseModel): ...
+class LlmAsJudgeScoringConfig(BaseModel):
+    @classmethod
+    def sample_run_config(cls, __distro_dir__: str, **kwargs: Any) -> Dict[str, Any]:
+        return {}
--- a/llama_stack/providers/inline/scoring/llm_as_judge/scoring.py
+++ b/llama_stack/providers/inline/scoring/llm_as_judge/scoring.py
@ -25,7 +25,7 @@ from llama_stack.providers.utils.common.data_schema_validator import (
 from .config import LlmAsJudgeScoringConfig
 from .scoring_fn.llm_as_judge_scoring_fn import LlmAsJudgeScoringFn

-LLM_JUDGE_FNS = [LlmAsJudgeScoringFn]
+LLM_JUDGE_FN = LlmAsJudgeScoringFn


 class LlmAsJudgeScoringImpl(
@ -43,23 +43,17 @@ class LlmAsJudgeScoringImpl(
        self.datasetio_api = datasetio_api
        self.datasets_api = datasets_api
        self.inference_api = inference_api
-        self.scoring_fn_id_impls = {}

    async def initialize(self) -> None:
-        for fn in LLM_JUDGE_FNS:
-            impl = fn(inference_api=self.inference_api)
-            for fn_defs in impl.get_supported_scoring_fn_defs():
-                self.scoring_fn_id_impls[fn_defs.identifier] = impl
-                self.llm_as_judge_fn = impl
+        impl = LLM_JUDGE_FN(inference_api=self.inference_api)
+        self.llm_as_judge_fn = impl

    async def shutdown(self) -> None: ...

    async def list_scoring_functions(self) -> List[ScoringFn]:
-        scoring_fn_defs_list = [
-            fn_def for impl in self.scoring_fn_id_impls.values() for fn_def in impl.get_supported_scoring_fn_defs()
-        ]
+        scoring_fn_defs_list = self.llm_as_judge_fn.get_supported_scoring_fn_defs()

-        for f in scoring_fn_defs_list:
+        for f in self.llm_as_judge_fn.get_supported_scoring_fn_defs():
            assert f.identifier.startswith("llm-as-judge"), (
                "All llm-as-judge scoring fn must have identifier prefixed with 'llm-as-judge'! "
            )
@ -67,7 +61,7 @@ class LlmAsJudgeScoringImpl(
        return scoring_fn_defs_list

    async def register_scoring_function(self, function_def: ScoringFn) -> None:
-        raise NotImplementedError("Register scoring function not implemented yet")
+        self.llm_as_judge_fn.register_scoring_fn_def(function_def)

    async def score_batch(
        self,
@ -102,9 +96,7 @@ class LlmAsJudgeScoringImpl(
    ) -> ScoreResponse:
        res = {}
        for scoring_fn_id in scoring_functions.keys():
-            if scoring_fn_id not in self.scoring_fn_id_impls:
-                raise ValueError(f"Scoring function {scoring_fn_id} is not supported.")
-            scoring_fn = self.scoring_fn_id_impls[scoring_fn_id]
+            scoring_fn = self.llm_as_judge_fn
            scoring_fn_params = scoring_functions.get(scoring_fn_id, None)
            score_results = await scoring_fn.score(input_rows, scoring_fn_id, scoring_fn_params)
            agg_results = await scoring_fn.aggregate(score_results, scoring_fn_id, scoring_fn_params)
--- a/llama_stack/providers/inline/scoring/llm_as_judge/scoring_fn/llm_as_judge_scoring_fn.py
+++ b/llama_stack/providers/inline/scoring/llm_as_judge/scoring_fn/llm_as_judge_scoring_fn.py
@ -6,7 +6,7 @@
 import re
 from typing import Any, Dict, Optional

-from llama_stack.apis.inference.inference import Inference
+from llama_stack.apis.inference.inference import Inference, UserMessage
 from llama_stack.apis.scoring import ScoringResultRow
 from llama_stack.apis.scoring_functions import ScoringFnParams
 from llama_stack.providers.utils.scoring.base_scoring_fn import RegisteredBaseScoringFn
@ -58,10 +58,9 @@ class LlmAsJudgeScoringFn(RegisteredBaseScoringFn):
        judge_response = await self.inference_api.chat_completion(
            model_id=fn_def.params.judge_model,
            messages=[
-                {
-                    "role": "user",
-                    "content": judge_input_msg,
-                }
+                UserMessage(
+                    content=judge_input_msg,
+                ),
            ],
        )
        content = judge_response.completion_message.content
--- a/llama_stack/providers/inline/telemetry/meta_reference/config.py
+++ b/llama_stack/providers/inline/telemetry/meta_reference/config.py
@ -44,9 +44,9 @@ class TelemetryConfig(BaseModel):
        return v

    @classmethod
-    def sample_run_config(cls, __distro_dir__: str = "runtime", db_name: str = "trace_store.db") -> Dict[str, Any]:
+    def sample_run_config(cls, __distro_dir__: str, db_name: str = "trace_store.db") -> Dict[str, Any]:
        return {
            "service_name": "${env.OTEL_SERVICE_NAME:llama-stack}",
            "sinks": "${env.TELEMETRY_SINKS:console,sqlite}",
-            "sqlite_db_path": "${env.SQLITE_DB_PATH:~/.llama/" + __distro_dir__ + "/" + db_name + "}",
+            "sqlite_db_path": "${env.SQLITE_DB_PATH:" + __distro_dir__ + "/" + db_name + "}",
        }
--- a/llama_stack/providers/inline/telemetry/meta_reference/telemetry.py
+++ b/llama_stack/providers/inline/telemetry/meta_reference/telemetry.py
@ -73,6 +73,7 @@ class TelemetryAdapter(TelemetryDatasetMixin, Telemetry):
    def __init__(self, config: TelemetryConfig, deps: Dict[str, Any]) -> None:
        self.config = config
        self.datasetio_api = deps.get(Api.datasetio)
+        self.meter = None

        resource = Resource.create(
            {
@ -171,6 +172,8 @@ class TelemetryAdapter(TelemetryDatasetMixin, Telemetry):
        return _GLOBAL_STORAGE["gauges"][name]

    def _log_metric(self, event: MetricEvent) -> None:
+        if self.meter is None:
+            return
        if isinstance(event.value, int):
            counter = self._get_or_create_counter(event.metric, event.unit)
            counter.add(event.value, attributes=event.attributes)
--- a/llama_stack/providers/inline/telemetry/sample/init.py
+++ b/llama_stack/providers/inline/telemetry/sample/init.py
@ -1,17 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from typing import Any
-
-from .config import SampleConfig
-
-
-async def get_adapter_impl(config: SampleConfig, _deps) -> Any:
-    from .sample import SampleTelemetryImpl
-
-    impl = SampleTelemetryImpl(config)
-    await impl.initialize()
-    return impl
--- a/llama_stack/providers/inline/telemetry/sample/config.py
+++ b/llama_stack/providers/inline/telemetry/sample/config.py
@ -1,12 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from pydantic import BaseModel
-
-
-class SampleConfig(BaseModel):
-    host: str = "localhost"
-    port: int = 9999
--- a/llama_stack/providers/inline/telemetry/sample/sample.py
+++ b/llama_stack/providers/inline/telemetry/sample/sample.py
@ -1,17 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from llama_stack.apis.telemetry import Telemetry
-
-from .config import SampleConfig
-
-
-class SampleTelemetryImpl(Telemetry):
-    def __init__(self, config: SampleConfig):
-        self.config = config
-
-    async def initialize(self):
-        pass
--- a/llama_stack/providers/inline/tool_runtime/code_interpreter/init.py
+++ b/llama_stack/providers/inline/tool_runtime/code_interpreter/init.py
@ -4,12 +4,14 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.

+from typing import Any, Dict
+
 from .config import CodeInterpreterToolConfig

 __all__ = ["CodeInterpreterToolConfig", "CodeInterpreterToolRuntimeImpl"]


-async def get_provider_impl(config: CodeInterpreterToolConfig, _deps):
+async def get_provider_impl(config: CodeInterpreterToolConfig, _deps: Dict[str, Any]):
    from .code_interpreter import CodeInterpreterToolRuntimeImpl

    impl = CodeInterpreterToolRuntimeImpl(config)
--- a/llama_stack/providers/inline/tool_runtime/code_interpreter/code_execution.py
+++ b/llama_stack/providers/inline/tool_runtime/code_interpreter/code_execution.py
@ -76,6 +76,7 @@ class CodeExecutionRequest:
    only_last_cell_fail: bool = True
    seed: int = 0
    strip_fpaths_in_stderr: bool = True
+    use_bwrap: bool = True


 class CodeExecutor:
@ -103,8 +104,6 @@ _set_seeds()\

        script = "\n\n".join([seeds_prefix] + [CODE_ENV_PREFIX] + scripts)
        with tempfile.TemporaryDirectory() as dpath:
-            bwrap_prefix = "bwrap " + generate_bwrap_command(bind_dirs=[dpath])
-            cmd = [*bwrap_prefix.split(), sys.executable, "-c", script]
            code_fpath = os.path.join(dpath, "code.py")
            with open(code_fpath, "w") as f:
                f.write(script)
@ -118,6 +117,13 @@ _set_seeds()\
                    MPLBACKEND="module://matplotlib_custom_backend",
                    PYTHONPATH=f"{DIRNAME}:{python_path}",
                )
+
+                if req.use_bwrap:
+                    bwrap_prefix = "bwrap " + generate_bwrap_command(bind_dirs=[dpath])
+                    cmd = [*bwrap_prefix.split(), sys.executable, "-c", script]
+                else:
+                    cmd = [sys.executable, "-c", script]
+
                stdout, stderr, returncode = do_subprocess(
                    cmd=cmd,
                    env=env,
--- a/llama_stack/providers/inline/tool_runtime/code_interpreter/code_interpreter.py
+++ b/llama_stack/providers/inline/tool_runtime/code_interpreter/code_interpreter.py
@ -6,6 +6,7 @@


 import logging
+import os
 import tempfile
 from typing import Any, Dict, List, Optional

@ -61,7 +62,9 @@ class CodeInterpreterToolRuntimeImpl(ToolsProtocolPrivate, ToolRuntime):

    async def invoke_tool(self, tool_name: str, kwargs: Dict[str, Any]) -> ToolInvocationResult:
        script = kwargs["code"]
-        req = CodeExecutionRequest(scripts=[script])
+        # Use environment variable to control bwrap usage
+        force_disable_bwrap = os.environ.get("DISABLE_CODE_SANDBOX", "").lower() in ("1", "true", "yes")
+        req = CodeExecutionRequest(scripts=[script], use_bwrap=not force_disable_bwrap)
        res = self.code_executor.execute(req)
        pieces = [res["process_status"]]
        for out_type in ["stdout", "stderr"]:
--- a/llama_stack/providers/inline/tool_runtime/code_interpreter/config.py
+++ b/llama_stack/providers/inline/tool_runtime/code_interpreter/config.py
@ -4,8 +4,12 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.

+from typing import Any, Dict
+
 from pydantic import BaseModel


 class CodeInterpreterToolConfig(BaseModel):
-    pass
+    @classmethod
+    def sample_run_config(cls, __distro_dir__: str, **kwargs: Any) -> Dict[str, Any]:
+        return {}
--- a/llama_stack/providers/inline/tool_runtime/rag/config.py
+++ b/llama_stack/providers/inline/tool_runtime/rag/config.py
@ -4,8 +4,12 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.

+from typing import Any, Dict
+
 from pydantic import BaseModel


 class RagToolRuntimeConfig(BaseModel):
-    pass
+    @classmethod
+    def sample_run_config(cls, __distro_dir__: str, **kwargs: Any) -> Dict[str, Any]:
+        return {}
--- a/llama_stack/providers/inline/vector_io/chroma/init.py
+++ b/llama_stack/providers/inline/vector_io/chroma/init.py
@ -4,14 +4,14 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.

-from typing import Dict
+from typing import Any, Dict

-from llama_stack.providers.datatypes import Api, ProviderSpec
+from llama_stack.providers.datatypes import Api

 from .config import ChromaVectorIOConfig


-async def get_provider_impl(config: ChromaVectorIOConfig, deps: Dict[Api, ProviderSpec]):
+async def get_provider_impl(config: ChromaVectorIOConfig, deps: Dict[Api, Any]):
    from llama_stack.providers.remote.vector_io.chroma.chroma import (
        ChromaVectorIOAdapter,
    )
--- a/llama_stack/providers/inline/vector_io/chroma/config.py
+++ b/llama_stack/providers/inline/vector_io/chroma/config.py
@ -13,5 +13,5 @@ class ChromaVectorIOConfig(BaseModel):
    db_path: str

    @classmethod
-    def sample_config(cls) -> Dict[str, Any]:
-        return {"db_path": "{env.CHROMADB_PATH}"}
+    def sample_run_config(cls, db_path: str = "${env.CHROMADB_PATH}", **kwargs: Any) -> Dict[str, Any]:
+        return {"db_path": db_path}
--- a/llama_stack/providers/inline/vector_io/faiss/init.py
+++ b/llama_stack/providers/inline/vector_io/faiss/init.py
@ -4,14 +4,14 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.

-from typing import Dict
+from typing import Any, Dict

-from llama_stack.providers.datatypes import Api, ProviderSpec
+from llama_stack.providers.datatypes import Api

 from .config import FaissVectorIOConfig


-async def get_provider_impl(config: FaissVectorIOConfig, deps: Dict[Api, ProviderSpec]):
+async def get_provider_impl(config: FaissVectorIOConfig, deps: Dict[Api, Any]):
    from .faiss import FaissVectorIOAdapter

    assert isinstance(config, FaissVectorIOConfig), f"Unexpected config type: {type(config)}"
--- a/llama_stack/providers/inline/vector_io/faiss/faiss.py
+++ b/llama_stack/providers/inline/vector_io/faiss/faiss.py
@ -4,6 +4,7 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.

+import asyncio
 import base64
 import io
 import json
@ -99,7 +100,7 @@ class FaissIndex(EmbeddingIndex):
        await self._save_index()

    async def query(self, embedding: NDArray, k: int, score_threshold: float) -> QueryChunksResponse:
-        distances, indices = self.index.search(embedding.reshape(1, -1).astype(np.float32), k)
+        distances, indices = await asyncio.to_thread(self.index.search, embedding.reshape(1, -1).astype(np.float32), k)

        chunks = []
        scores = []
--- a/llama_stack/providers/inline/vector_io/sqlite_vec/init.py
+++ b/llama_stack/providers/inline/vector_io/sqlite_vec/init.py
@ -4,14 +4,14 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.

-from typing import Dict
+from typing import Any, Dict

-from llama_stack.providers.datatypes import Api, ProviderSpec
+from llama_stack.providers.datatypes import Api

 from .config import SQLiteVectorIOConfig


-async def get_provider_impl(config: SQLiteVectorIOConfig, deps: Dict[Api, ProviderSpec]):
+async def get_provider_impl(config: SQLiteVectorIOConfig, deps: Dict[Api, Any]):
    from .sqlite_vec import SQLiteVecVectorIOAdapter

    assert isinstance(config, SQLiteVectorIOConfig), f"Unexpected config type: {type(config)}"
--- a/llama_stack/providers/inline/vector_io/sqlite_vec/config.py
+++ b/llama_stack/providers/inline/vector_io/sqlite_vec/config.py
@ -15,5 +15,5 @@ class SQLiteVectorIOConfig(BaseModel):
    @classmethod
    def sample_run_config(cls, __distro_dir__: str) -> Dict[str, Any]:
        return {
-            "db_path": "${env.SQLITE_STORE_DIR:~/.llama/" + __distro_dir__ + "}/" + "sqlite_vec.db",
+            "db_path": "${env.SQLITE_STORE_DIR:" + __distro_dir__ + "}/" + "sqlite_vec.db",
        }