Merge branch 'main' of https://github.com/meta-llama/llama-stack into add_nemo_customizer

Ubuntu 2025-03-20 09:34:19 +00:00
commit f534b4c2ea
571 changed files with 229651 additions and 12956 deletions

View file

@ -4,14 +4,14 @@
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from typing import Dict
from typing import Any, Dict
from llama_stack.distribution.datatypes import Api, ProviderSpec
from llama_stack.distribution.datatypes import Api
from .config import MetaReferenceAgentsImplConfig
async def get_provider_impl(config: MetaReferenceAgentsImplConfig, deps: Dict[Api, ProviderSpec]):
async def get_provider_impl(config: MetaReferenceAgentsImplConfig, deps: Dict[Api, Any]):
from .agents import MetaReferenceAgentsImpl
impl = MetaReferenceAgentsImpl(

View file

@ -12,6 +12,7 @@ import uuid
from typing import AsyncGenerator, List, Optional, Union
from llama_stack.apis.agents import (
Agent,
AgentConfig,
AgentCreateResponse,
Agents,
@ -21,12 +22,15 @@ from llama_stack.apis.agents import (
AgentTurnCreateRequest,
AgentTurnResumeRequest,
Document,
ListAgentSessionsResponse,
ListAgentsResponse,
Session,
Turn,
)
from llama_stack.apis.inference import (
Inference,
ToolConfig,
ToolResponse,
ToolResponseMessage,
UserMessage,
)
@ -83,7 +87,7 @@ class MetaReferenceAgentsImpl(Agents):
agent_id=agent_id,
)
async def get_agent(self, agent_id: str) -> ChatAgent:
async def _get_agent_impl(self, agent_id: str) -> ChatAgent:
agent_config = await self.persistence_store.get(
key=f"agent:{agent_id}",
)
@ -119,7 +123,7 @@ class MetaReferenceAgentsImpl(Agents):
agent_id: str,
session_name: str,
) -> AgentSessionCreateResponse:
agent = await self.get_agent(agent_id)
agent = await self._get_agent_impl(agent_id)
session_id = await agent.create_session(session_name)
return AgentSessionCreateResponse(
@ -140,7 +144,6 @@ class MetaReferenceAgentsImpl(Agents):
documents: Optional[List[Document]] = None,
stream: Optional[bool] = False,
tool_config: Optional[ToolConfig] = None,
allow_turn_resume: Optional[bool] = False,
) -> AsyncGenerator:
request = AgentTurnCreateRequest(
agent_id=agent_id,
@ -150,7 +153,6 @@ class MetaReferenceAgentsImpl(Agents):
toolgroups=toolgroups,
documents=documents,
tool_config=tool_config,
allow_turn_resume=allow_turn_resume,
)
if stream:
return self._create_agent_turn_streaming(request)
@ -161,7 +163,7 @@ class MetaReferenceAgentsImpl(Agents):
self,
request: AgentTurnCreateRequest,
) -> AsyncGenerator:
agent = await self.get_agent(request.agent_id)
agent = await self._get_agent_impl(request.agent_id)
async for event in agent.create_and_execute_turn(request):
yield event
@ -170,7 +172,7 @@ class MetaReferenceAgentsImpl(Agents):
agent_id: str,
session_id: str,
turn_id: str,
tool_responses: List[ToolResponseMessage],
tool_responses: List[ToolResponse],
stream: Optional[bool] = False,
) -> AsyncGenerator:
request = AgentTurnResumeRequest(
@ -189,22 +191,18 @@ class MetaReferenceAgentsImpl(Agents):
self,
request: AgentTurnResumeRequest,
) -> AsyncGenerator:
agent = await self.get_agent(request.agent_id)
agent = await self._get_agent_impl(request.agent_id)
async for event in agent.resume_turn(request):
yield event
async def get_agents_turn(self, agent_id: str, session_id: str, turn_id: str) -> Turn:
turn = await self.persistence_store.get(f"session:{agent_id}:{session_id}:{turn_id}")
turn = json.loads(turn)
turn = Turn(**turn)
agent = await self._get_agent_impl(agent_id)
turn = await agent.storage.get_session_turn(session_id, turn_id)
return turn
async def get_agents_step(self, agent_id: str, session_id: str, turn_id: str, step_id: str) -> AgentStepResponse:
turn = await self.persistence_store.get(f"session:{agent_id}:{session_id}:{turn_id}")
turn = json.loads(turn)
turn = Turn(**turn)
steps = turn.steps
for step in steps:
turn = await self.get_agents_turn(agent_id, session_id, turn_id)
for step in turn.steps:
if step.step_id == step_id:
return AgentStepResponse(step=step)
raise ValueError(f"Provided step_id {step_id} could not be found")
@ -215,20 +213,18 @@ class MetaReferenceAgentsImpl(Agents):
session_id: str,
turn_ids: Optional[List[str]] = None,
) -> Session:
session = await self.persistence_store.get(f"session:{agent_id}:{session_id}")
session = Session(**json.loads(session), turns=[])
turns = []
agent = await self._get_agent_impl(agent_id)
session_info = await agent.storage.get_session_info(session_id)
if session_info is None:
raise ValueError(f"Session {session_id} not found")
turns = await agent.storage.get_session_turns(session_id)
if turn_ids:
for turn_id in turn_ids:
turn = await self.persistence_store.get(f"session:{agent_id}:{session_id}:{turn_id}")
turn = json.loads(turn)
turn = Turn(**turn)
turns.append(turn)
turns = [turn for turn in turns if turn.turn_id in turn_ids]
return Session(
session_name=session.session_name,
session_name=session_info.session_name,
session_id=session_id,
turns=turns if turns else [],
started_at=session.started_at,
turns=turns,
started_at=session_info.started_at,
)
async def delete_agents_session(self, agent_id: str, session_id: str) -> None:
@ -239,3 +235,15 @@ class MetaReferenceAgentsImpl(Agents):
async def shutdown(self) -> None:
pass
async def list_agents(self) -> ListAgentsResponse:
pass
async def get_agent(self, agent_id: str) -> Agent:
pass
async def list_agent_sessions(
self,
agent_id: str,
) -> ListAgentSessionsResponse:
pass
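With this refactor, get_agents_step delegates to get_agents_turn and then scans the turn's steps for the requested step_id. A minimal standalone sketch of that lookup, using hypothetical stand-in types rather than the real llama_stack pydantic models:

from dataclasses import dataclass, field
from typing import List

@dataclass
class _Step:            # stand-in for the real step types
    step_id: str

@dataclass
class _Turn:            # stand-in for llama_stack.apis.agents.Turn
    turn_id: str
    steps: List[_Step] = field(default_factory=list)

def find_step(turn: _Turn, step_id: str) -> _Step:
    # mirrors get_agents_step: fetch the turn once, then search its steps
    for step in turn.steps:
        if step.step_id == step_id:
            return step
    raise ValueError(f"Provided step_id {step_id} could not be found")

turn = _Turn(turn_id="t1", steps=[_Step("s1"), _Step("s2")])
assert find_step(turn, "s2").step_id == "s2"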

View file

@ -7,7 +7,7 @@
import json
import logging
import uuid
from datetime import datetime
from datetime import datetime, timezone
from typing import List, Optional
from pydantic import BaseModel
@ -21,6 +21,7 @@ log = logging.getLogger(__name__)
class AgentSessionInfo(BaseModel):
session_id: str
session_name: str
# TODO: is this used anywhere?
vector_db_id: Optional[str] = None
started_at: datetime
@ -35,7 +36,7 @@ class AgentPersistence:
session_info = AgentSessionInfo(
session_id=session_id,
session_name=name,
started_at=datetime.now(),
started_at=datetime.now(timezone.utc),
)
await self.kvstore.set(
key=f"session:{self.agent_id}:{session_id}",
@ -85,6 +86,14 @@ class AgentPersistence:
turns.sort(key=lambda x: (x.completed_at or datetime.min))
return turns
async def get_session_turn(self, session_id: str, turn_id: str) -> Optional[Turn]:
value = await self.kvstore.get(
key=f"session:{self.agent_id}:{session_id}:{turn_id}",
)
if not value:
return None
return Turn(**json.loads(value))
async def set_in_progress_tool_call_step(self, session_id: str, turn_id: str, step: ToolExecutionStep):
await self.kvstore.set(
key=f"in_progress_tool_call_step:{self.agent_id}:{session_id}:{turn_id}",
@ -96,3 +105,15 @@ class AgentPersistence:
key=f"in_progress_tool_call_step:{self.agent_id}:{session_id}:{turn_id}",
)
return ToolExecutionStep(**json.loads(value)) if value else None
async def set_num_infer_iters_in_turn(self, session_id: str, turn_id: str, num_infer_iters: int):
await self.kvstore.set(
key=f"num_infer_iters_in_turn:{self.agent_id}:{session_id}:{turn_id}",
value=str(num_infer_iters),
)
async def get_num_infer_iters_in_turn(self, session_id: str, turn_id: str) -> Optional[int]:
value = await self.kvstore.get(
key=f"num_infer_iters_in_turn:{self.agent_id}:{session_id}:{turn_id}",
)
return int(value) if value else None
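The new num_infer_iters_in_turn helpers are a plain string round-trip through the kvstore. A minimal sketch of that pattern against a hypothetical in-memory kvstore (the real provider uses the configured KVStore backend, e.g. SQLite):

import asyncio
from typing import Dict, Optional

class InMemoryKVStore:
    """Hypothetical stand-in for the kvstore interface used above."""
    def __init__(self) -> None:
        self._data: Dict[str, str] = {}
    async def set(self, key: str, value: str) -> None:
        self._data[key] = value
    async def get(self, key: str) -> Optional[str]:
        return self._data.get(key)

async def main() -> None:
    kv = InMemoryKVStore()
    agent_id, session_id, turn_id = "a1", "s1", "t1"
    key = f"num_infer_iters_in_turn:{agent_id}:{session_id}:{turn_id}"
    await kv.set(key=key, value=str(3))   # what set_num_infer_iters_in_turn does
    value = await kv.get(key=key)         # what get_num_infer_iters_in_turn reads back
    assert (int(value) if value else None) == 3

asyncio.run(main())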

View file

@ -10,6 +10,7 @@ from typing import List
from llama_stack.apis.inference import Message
from llama_stack.apis.safety import Safety, SafetyViolation, ViolationLevel
from llama_stack.providers.utils.telemetry import tracing
log = logging.getLogger(__name__)
@ -32,15 +33,14 @@ class ShieldRunnerMixin:
self.output_shields = output_shields
async def run_multiple_shields(self, messages: List[Message], identifiers: List[str]) -> None:
responses = await asyncio.gather(
*[
self.safety_api.run_shield(
async def run_shield_with_span(identifier: str):
async with tracing.span(f"run_shield_{identifier}"):
return await self.safety_api.run_shield(
shield_id=identifier,
messages=messages,
)
for identifier in identifiers
]
)
responses = await asyncio.gather(*[run_shield_with_span(identifier) for identifier in identifiers])
for identifier, response in zip(identifiers, responses, strict=False):
if not response.violation:
continue
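The change above wraps each run_shield call in its own tracing span before gathering, so per-shield latency shows up in traces. A generic sketch of the pattern with a hypothetical async span context manager, independent of the llama_stack tracing utilities:

import asyncio
from contextlib import asynccontextmanager

@asynccontextmanager
async def span(name: str):
    # hypothetical stand-in for tracing.span; a real tracer would record timing here
    print(f"enter {name}")
    try:
        yield
    finally:
        print(f"exit {name}")

async def run_shield(shield_id: str, messages):
    await asyncio.sleep(0)   # placeholder for the actual safety call
    return {"shield_id": shield_id, "violation": None}

async def run_multiple_shields(messages, identifiers):
    async def run_shield_with_span(identifier: str):
        # one span per shield, then run all shields concurrently
        async with span(f"run_shield_{identifier}"):
            return await run_shield(identifier, messages)

    return await asyncio.gather(*[run_shield_with_span(i) for i in identifiers])

print(asyncio.run(run_multiple_shields(["hi"], ["shield_a", "shield_b"])))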

View file

@ -1,400 +0,0 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import tempfile
from typing import AsyncIterator, List, Optional, Union
import pytest
from llama_stack.apis.agents import (
AgentConfig,
AgentToolGroupWithArgs,
AgentTurnCreateRequest,
AgentTurnResponseTurnCompletePayload,
StepType,
)
from llama_stack.apis.common.content_types import URL
from llama_stack.apis.inference import (
ChatCompletionResponse,
ChatCompletionResponseEvent,
ChatCompletionResponseStreamChunk,
CompletionMessage,
LogProbConfig,
Message,
ResponseFormat,
SamplingParams,
ToolChoice,
ToolDefinition,
ToolPromptFormat,
UserMessage,
)
from llama_stack.apis.safety import RunShieldResponse
from llama_stack.apis.tools import (
Tool,
ToolDef,
ToolGroup,
ToolHost,
ToolInvocationResult,
)
from llama_stack.apis.vector_io import QueryChunksResponse
from llama_stack.models.llama.datatypes import BuiltinTool
from llama_stack.providers.inline.agents.meta_reference.agent_instance import (
MEMORY_QUERY_TOOL,
)
from llama_stack.providers.inline.agents.meta_reference.agents import (
MetaReferenceAgentsImpl,
MetaReferenceAgentsImplConfig,
)
from llama_stack.providers.utils.kvstore.config import SqliteKVStoreConfig
class MockInferenceAPI:
async def chat_completion(
self,
model: str,
messages: List[Message],
sampling_params: Optional[SamplingParams] = SamplingParams(),
response_format: Optional[ResponseFormat] = None,
tools: Optional[List[ToolDefinition]] = None,
tool_choice: Optional[ToolChoice] = None,
tool_prompt_format: Optional[ToolPromptFormat] = None,
stream: Optional[bool] = False,
logprobs: Optional[LogProbConfig] = None,
) -> Union[ChatCompletionResponse, AsyncIterator[ChatCompletionResponseStreamChunk]]:
async def stream_response():
yield ChatCompletionResponseStreamChunk(
event=ChatCompletionResponseEvent(
event_type="start",
delta="",
)
)
yield ChatCompletionResponseStreamChunk(
event=ChatCompletionResponseEvent(
event_type="progress",
delta="AI is a fascinating field...",
)
)
yield ChatCompletionResponseStreamChunk(
event=ChatCompletionResponseEvent(
event_type="complete",
delta="",
stop_reason="end_of_turn",
)
)
if stream:
return stream_response()
else:
return ChatCompletionResponse(
completion_message=CompletionMessage(
role="assistant",
content="Mock response",
stop_reason="end_of_turn",
),
logprobs={"token_logprobs": [0.1, 0.2, 0.3]} if logprobs else None,
)
class MockSafetyAPI:
async def run_shield(self, shield_id: str, messages: List[Message]) -> RunShieldResponse:
return RunShieldResponse(violation=None)
class MockVectorIOAPI:
def __init__(self):
self.chunks = {}
async def insert_chunks(self, vector_db_id, chunks, ttl_seconds=None):
for chunk in chunks:
metadata = chunk.metadata
self.chunks[vector_db_id][metadata["document_id"]] = chunk
async def query_chunks(self, vector_db_id, query, params=None):
if vector_db_id not in self.chunks:
raise ValueError(f"Bank {vector_db_id} not found")
chunks = list(self.chunks[vector_db_id].values())
scores = [1.0] * len(chunks)
return QueryChunksResponse(chunks=chunks, scores=scores)
class MockToolGroupsAPI:
async def register_tool_group(self, toolgroup_id: str, provider_id: str, mcp_endpoint=None, args=None) -> None:
pass
async def get_tool_group(self, toolgroup_id: str) -> ToolGroup:
return ToolGroup(
identifier=toolgroup_id,
provider_resource_id=toolgroup_id,
)
async def list_tool_groups(self) -> List[ToolGroup]:
return []
async def list_tools(self, tool_group_id: Optional[str] = None) -> List[Tool]:
if tool_group_id == MEMORY_TOOLGROUP:
return [
Tool(
identifier=MEMORY_QUERY_TOOL,
provider_resource_id=MEMORY_QUERY_TOOL,
toolgroup_id=MEMORY_TOOLGROUP,
tool_host=ToolHost.client,
description="Mock tool",
provider_id="builtin::rag",
parameters=[],
)
]
if tool_group_id == CODE_INTERPRETER_TOOLGROUP:
return [
Tool(
identifier="code_interpreter",
provider_resource_id="code_interpreter",
toolgroup_id=CODE_INTERPRETER_TOOLGROUP,
tool_host=ToolHost.client,
description="Mock tool",
provider_id="builtin::code_interpreter",
parameters=[],
)
]
return []
async def get_tool(self, tool_name: str) -> Tool:
return Tool(
identifier=tool_name,
provider_resource_id=tool_name,
toolgroup_id="mock_group",
tool_host=ToolHost.client,
description="Mock tool",
provider_id="mock_provider",
parameters=[],
)
async def unregister_tool_group(self, tool_group_id: str) -> None:
pass
class MockToolRuntimeAPI:
async def list_runtime_tools(
self, tool_group_id: Optional[str] = None, mcp_endpoint: Optional[URL] = None
) -> List[ToolDef]:
return []
async def invoke_tool(self, tool_name: str, args: dict) -> ToolInvocationResult:
return ToolInvocationResult(content={"result": "Mock tool result"})
@pytest.fixture
def mock_inference_api():
return MockInferenceAPI()
@pytest.fixture
def mock_safety_api():
return MockSafetyAPI()
@pytest.fixture
def mock_vector_io_api():
return MockVectorIOAPI()
@pytest.fixture
def mock_tool_groups_api():
return MockToolGroupsAPI()
@pytest.fixture
def mock_tool_runtime_api():
return MockToolRuntimeAPI()
@pytest.fixture
async def get_agents_impl(
mock_inference_api,
mock_safety_api,
mock_vector_io_api,
mock_tool_runtime_api,
mock_tool_groups_api,
):
sqlite_file = tempfile.NamedTemporaryFile(delete=False, suffix=".db")
impl = MetaReferenceAgentsImpl(
config=MetaReferenceAgentsImplConfig(
persistence_store=SqliteKVStoreConfig(
db_name=sqlite_file.name,
),
),
inference_api=mock_inference_api,
safety_api=mock_safety_api,
vector_io_api=mock_vector_io_api,
tool_runtime_api=mock_tool_runtime_api,
tool_groups_api=mock_tool_groups_api,
)
await impl.initialize()
return impl
@pytest.fixture
async def get_chat_agent(get_agents_impl):
impl = await get_agents_impl
agent_config = AgentConfig(
model="test_model",
instructions="You are a helpful assistant.",
toolgroups=[],
tool_choice=ToolChoice.auto,
enable_session_persistence=False,
input_shields=["test_shield"],
)
response = await impl.create_agent(agent_config)
return await impl.get_agent(response.agent_id)
MEMORY_TOOLGROUP = "builtin::rag"
CODE_INTERPRETER_TOOLGROUP = "builtin::code_interpreter"
@pytest.fixture
async def get_chat_agent_with_tools(get_agents_impl, request):
impl = await get_agents_impl
toolgroups = request.param
agent_config = AgentConfig(
model="test_model",
instructions="You are a helpful assistant.",
toolgroups=toolgroups,
tool_choice=ToolChoice.auto,
enable_session_persistence=False,
input_shields=["test_shield"],
)
response = await impl.create_agent(agent_config)
return await impl.get_agent(response.agent_id)
@pytest.mark.asyncio
async def test_chat_agent_create_and_execute_turn(get_chat_agent):
chat_agent = await get_chat_agent
session_id = await chat_agent.create_session("Test Session")
request = AgentTurnCreateRequest(
agent_id=chat_agent.agent_id,
session_id=session_id,
messages=[UserMessage(content="Hello")],
stream=True,
)
responses = []
async for response in chat_agent.create_and_execute_turn(request):
responses.append(response)
assert len(responses) > 0
assert (
len(responses) == 7
) # TurnStart, ShieldCallStart, ShieldCallComplete, StepStart, StepProgress, StepComplete, TurnComplete
assert responses[0].event.payload.turn_id is not None
@pytest.mark.asyncio
async def test_run_multiple_shields_wrapper(get_chat_agent):
chat_agent = await get_chat_agent
messages = [UserMessage(content="Test message")]
shields = ["test_shield"]
responses = [
chunk
async for chunk in chat_agent.run_multiple_shields_wrapper(
turn_id="test_turn_id",
messages=messages,
shields=shields,
touchpoint="user-input",
)
]
assert len(responses) == 2 # StepStart, StepComplete
assert responses[0].event.payload.step_type.value == "shield_call"
assert not responses[1].event.payload.step_details.violation
@pytest.mark.asyncio
async def test_chat_agent_complex_turn(get_chat_agent):
chat_agent = await get_chat_agent
session_id = await chat_agent.create_session("Test Session")
request = AgentTurnCreateRequest(
agent_id=chat_agent.agent_id,
session_id=session_id,
messages=[UserMessage(content="Tell me about AI and then use a tool.")],
stream=True,
)
responses = []
async for response in chat_agent.create_and_execute_turn(request):
responses.append(response)
assert len(responses) > 0
step_types = [
response.event.payload.step_type for response in responses if hasattr(response.event.payload, "step_type")
]
assert StepType.shield_call in step_types, "Shield call step is missing"
assert StepType.inference in step_types, "Inference step is missing"
event_types = [
response.event.payload.event_type for response in responses if hasattr(response.event.payload, "event_type")
]
assert "turn_start" in event_types, "Start event is missing"
assert "turn_complete" in event_types, "Complete event is missing"
assert any(isinstance(response.event.payload, AgentTurnResponseTurnCompletePayload) for response in responses), (
"Turn complete event is missing"
)
turn_complete_payload = next(
response.event.payload
for response in responses
if isinstance(response.event.payload, AgentTurnResponseTurnCompletePayload)
)
turn = turn_complete_payload.turn
assert turn.input_messages == request.messages, "Input messages do not match"
@pytest.mark.asyncio
@pytest.mark.parametrize(
"toolgroups, expected_memory, expected_code_interpreter",
[
([], False, False), # no tools
([MEMORY_TOOLGROUP], True, False), # memory only
([CODE_INTERPRETER_TOOLGROUP], False, True), # code interpreter only
([MEMORY_TOOLGROUP, CODE_INTERPRETER_TOOLGROUP], True, True), # all tools
],
)
async def test_chat_agent_tools(get_agents_impl, toolgroups, expected_memory, expected_code_interpreter):
impl = await get_agents_impl
agent_config = AgentConfig(
model="test_model",
instructions="You are a helpful assistant.",
toolgroups=toolgroups,
tool_choice=ToolChoice.auto,
enable_session_persistence=False,
input_shields=["test_shield"],
)
response = await impl.create_agent(agent_config)
chat_agent = await impl.get_agent(response.agent_id)
tool_defs, _ = await chat_agent._get_tool_defs()
if expected_memory:
assert MEMORY_QUERY_TOOL in tool_defs
if expected_code_interpreter:
assert BuiltinTool.code_interpreter in tool_defs
if expected_memory and expected_code_interpreter:
# override the tools for turn
new_tool_defs, _ = await chat_agent._get_tool_defs(
toolgroups_for_turn=[
AgentToolGroupWithArgs(
name=MEMORY_TOOLGROUP,
args={"vector_dbs": ["test_vector_db"]},
)
]
)
assert MEMORY_QUERY_TOOL in new_tool_defs
assert BuiltinTool.code_interpreter not in new_tool_defs

View file

@ -4,12 +4,14 @@
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from typing import Any, Dict
from .config import LocalFSDatasetIOConfig
async def get_provider_impl(
config: LocalFSDatasetIOConfig,
_deps,
_deps: Dict[str, Any],
):
from .datasetio import LocalFSDatasetIOImpl

View file

@ -3,9 +3,10 @@
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from typing import Any, Dict
from pydantic import BaseModel
from llama_stack.distribution.utils.config_dirs import RUNTIME_BASE_DIR
from llama_stack.providers.utils.kvstore.config import (
KVStoreConfig,
SqliteKVStoreConfig,
@ -13,6 +14,13 @@ from llama_stack.providers.utils.kvstore.config import (
class LocalFSDatasetIOConfig(BaseModel):
kvstore: KVStoreConfig = SqliteKVStoreConfig(
db_path=(RUNTIME_BASE_DIR / "localfs_datasetio.db").as_posix()
) # Uses SQLite config specific to localfs storage
kvstore: KVStoreConfig
@classmethod
def sample_run_config(cls, __distro_dir__: str, **kwargs: Any) -> Dict[str, Any]:
return {
"kvstore": SqliteKVStoreConfig.sample_run_config(
__distro_dir__=__distro_dir__,
db_name="localfs_datasetio.db",
)
}
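sample_run_config lets distro templates render a default kvstore block instead of relying on the old hard-coded SQLite path. A hedged usage sketch of the classmethod defined above; the directory value is made up and the exact keys of the returned dict come from SqliteKVStoreConfig.sample_run_config:

# hypothetical distro directory; templates pass their own value for __distro_dir__
cfg = LocalFSDatasetIOConfig.sample_run_config(__distro_dir__="distributions/my-distro")
# cfg["kvstore"] holds the SQLite settings rendered by SqliteKVStoreConfig.sample_run_config,
# pointing at a localfs_datasetio.db under the distro directory
print(cfg)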

View file

@ -3,20 +3,14 @@
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import base64
import os
from abc import ABC, abstractmethod
from dataclasses import dataclass
from typing import Any, Dict, List, Optional
from urllib.parse import urlparse
import pandas
from llama_stack.apis.common.content_types import URL
from llama_stack.apis.datasetio import DatasetIO, PaginatedRowsResult
from llama_stack.apis.datasetio import DatasetIO, IterrowsResponse
from llama_stack.apis.datasets import Dataset
from llama_stack.providers.datatypes import DatasetsProtocolPrivate
from llama_stack.providers.utils.datasetio.url_utils import get_dataframe_from_url
from llama_stack.providers.utils.datasetio.url_utils import get_dataframe_from_uri
from llama_stack.providers.utils.kvstore import kvstore_impl
from .config import LocalFSDatasetIOConfig
@ -24,30 +18,7 @@ from .config import LocalFSDatasetIOConfig
DATASETS_PREFIX = "localfs_datasets:"
class BaseDataset(ABC):
def __init__(self, *args, **kwargs) -> None:
super().__init__(*args, **kwargs)
@abstractmethod
def __len__(self) -> int:
raise NotImplementedError()
@abstractmethod
def __getitem__(self, idx):
raise NotImplementedError()
@abstractmethod
def load(self):
raise NotImplementedError()
@dataclass
class DatasetInfo:
dataset_def: Dataset
dataset_impl: BaseDataset
class PandasDataframeDataset(BaseDataset):
class PandasDataframeDataset:
def __init__(self, dataset_def: Dataset, *args, **kwargs) -> None:
super().__init__(*args, **kwargs)
self.dataset_def = dataset_def
@ -64,23 +35,19 @@ class PandasDataframeDataset(BaseDataset):
else:
return self.df.iloc[idx].to_dict()
def _validate_dataset_schema(self, df) -> pandas.DataFrame:
# note that we will drop any columns in dataset that are not in the schema
df = df[self.dataset_def.dataset_schema.keys()]
# check all columns in dataset schema are present
assert len(df.columns) == len(self.dataset_def.dataset_schema)
# TODO: type checking against column types in dataset schema
return df
def load(self) -> None:
async def load(self) -> None:
if self.df is not None:
return
df = get_dataframe_from_url(self.dataset_def.url)
if df is None:
raise ValueError(f"Failed to load dataset from {self.dataset_def.url}")
if self.dataset_def.source.type == "uri":
self.df = await get_dataframe_from_uri(self.dataset_def.source.uri)
elif self.dataset_def.source.type == "rows":
self.df = pandas.DataFrame(self.dataset_def.source.rows)
else:
raise ValueError(f"Unsupported dataset source type: {self.dataset_def.source.type}")
self.df = self._validate_dataset_schema(df)
if self.df is None:
raise ValueError(f"Failed to load dataset from {self.dataset_def.url}")
class LocalFSDatasetIOImpl(DatasetIO, DatasetsProtocolPrivate):
@ -99,95 +66,55 @@ class LocalFSDatasetIOImpl(DatasetIO, DatasetsProtocolPrivate):
for dataset in stored_datasets:
dataset = Dataset.model_validate_json(dataset)
dataset_impl = PandasDataframeDataset(dataset)
self.dataset_infos[dataset.identifier] = DatasetInfo(
dataset_def=dataset,
dataset_impl=dataset_impl,
)
self.dataset_infos[dataset.identifier] = dataset
async def shutdown(self) -> None: ...
async def register_dataset(
self,
dataset: Dataset,
dataset_def: Dataset,
) -> None:
# Store in kvstore
key = f"{DATASETS_PREFIX}{dataset.identifier}"
key = f"{DATASETS_PREFIX}{dataset_def.identifier}"
await self.kvstore.set(
key=key,
value=dataset.json(),
)
dataset_impl = PandasDataframeDataset(dataset)
self.dataset_infos[dataset.identifier] = DatasetInfo(
dataset_def=dataset,
dataset_impl=dataset_impl,
value=dataset_def.model_dump_json(),
)
self.dataset_infos[dataset_def.identifier] = dataset_def
async def unregister_dataset(self, dataset_id: str) -> None:
key = f"{DATASETS_PREFIX}{dataset_id}"
await self.kvstore.delete(key=key)
del self.dataset_infos[dataset_id]
async def get_rows_paginated(
async def iterrows(
self,
dataset_id: str,
rows_in_page: int,
page_token: Optional[str] = None,
filter_condition: Optional[str] = None,
) -> PaginatedRowsResult:
dataset_info = self.dataset_infos.get(dataset_id)
dataset_info.dataset_impl.load()
start_index: Optional[int] = None,
limit: Optional[int] = None,
) -> IterrowsResponse:
dataset_def = self.dataset_infos[dataset_id]
dataset_impl = PandasDataframeDataset(dataset_def)
await dataset_impl.load()
if page_token and not page_token.isnumeric():
raise ValueError("Invalid page_token")
start_index = start_index or 0
if page_token is None or len(page_token) == 0:
next_page_token = 0
if limit is None or limit == -1:
end = len(dataset_impl)
else:
next_page_token = int(page_token)
end = min(start_index + limit, len(dataset_impl))
start = next_page_token
if rows_in_page == -1:
end = len(dataset_info.dataset_impl)
else:
end = min(start + rows_in_page, len(dataset_info.dataset_impl))
rows = dataset_impl[start_index:end]
rows = dataset_info.dataset_impl[start:end]
return PaginatedRowsResult(
rows=rows,
total_count=len(rows),
next_page_token=str(end),
return IterrowsResponse(
data=rows,
next_start_index=end if end < len(dataset_impl) else None,
)
async def append_rows(self, dataset_id: str, rows: List[Dict[str, Any]]) -> None:
dataset_info = self.dataset_infos.get(dataset_id)
if dataset_info is None:
raise ValueError(f"Dataset with id {dataset_id} not found")
dataset_impl = dataset_info.dataset_impl
dataset_impl.load()
dataset_def = self.dataset_infos[dataset_id]
dataset_impl = PandasDataframeDataset(dataset_def)
await dataset_impl.load()
new_rows_df = pandas.DataFrame(rows)
new_rows_df = dataset_impl._validate_dataset_schema(new_rows_df)
dataset_impl.df = pandas.concat([dataset_impl.df, new_rows_df], ignore_index=True)
url = str(dataset_info.dataset_def.url)
parsed_url = urlparse(url)
if parsed_url.scheme == "file" or not parsed_url.scheme:
file_path = parsed_url.path
os.makedirs(os.path.dirname(file_path), exist_ok=True)
dataset_impl.df.to_csv(file_path, index=False)
elif parsed_url.scheme == "data":
# For data URLs, we need to update the base64-encoded content
if not parsed_url.path.startswith("text/csv;base64,"):
raise ValueError("Data URL must be a base64-encoded CSV")
csv_buffer = dataset_impl.df.to_csv(index=False)
base64_content = base64.b64encode(csv_buffer.encode("utf-8")).decode("utf-8")
dataset_info.dataset_def.url = URL(uri=f"data:text/csv;base64,{base64_content}")
else:
raise ValueError(
f"Unsupported URL scheme: {parsed_url.scheme}. Only file:// and data: URLs are supported for writing."
)
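iterrows replaces get_rows_paginated with a start_index/limit contract: the response carries data plus next_start_index, which is None once the dataset is exhausted. A minimal client-side paging sketch against that contract, with a hypothetical datasetio_api handle:

async def read_all_rows(datasetio_api, dataset_id: str, page_size: int = 100):
    rows = []
    start_index = 0
    while True:
        page = await datasetio_api.iterrows(
            dataset_id=dataset_id,
            start_index=start_index,
            limit=page_size,
        )
        rows.extend(page.data)
        if page.next_start_index is None:   # no more rows
            break
        start_index = page.next_start_index
    return rows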

View file

@ -3,16 +3,16 @@
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from typing import Dict
from typing import Any, Dict
from llama_stack.distribution.datatypes import Api, ProviderSpec
from llama_stack.distribution.datatypes import Api
from .config import MetaReferenceEvalConfig
async def get_provider_impl(
config: MetaReferenceEvalConfig,
deps: Dict[Api, ProviderSpec],
deps: Dict[Api, Any],
):
from .eval import MetaReferenceEvalImpl

View file

@ -3,9 +3,10 @@
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from typing import Any, Dict
from pydantic import BaseModel
from llama_stack.distribution.utils.config_dirs import RUNTIME_BASE_DIR
from llama_stack.providers.utils.kvstore.config import (
KVStoreConfig,
SqliteKVStoreConfig,
@ -13,6 +14,13 @@ from llama_stack.providers.utils.kvstore.config import (
class MetaReferenceEvalConfig(BaseModel):
kvstore: KVStoreConfig = SqliteKVStoreConfig(
db_path=(RUNTIME_BASE_DIR / "meta_reference_eval.db").as_posix()
) # Uses SQLite config specific to Meta Reference Eval storage
kvstore: KVStoreConfig
@classmethod
def sample_run_config(cls, __distro_dir__: str, **kwargs: Any) -> Dict[str, Any]:
return {
"kvstore": SqliteKVStoreConfig.sample_run_config(
__distro_dir__=__distro_dir__,
db_name="meta_reference_eval.db",
)
}

View file

@ -3,6 +3,7 @@
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import json
from typing import Any, Dict, List, Optional
from tqdm import tqdm
@ -11,18 +12,13 @@ from llama_stack.apis.agents import Agents, StepType
from llama_stack.apis.benchmarks import Benchmark
from llama_stack.apis.datasetio import DatasetIO
from llama_stack.apis.datasets import Datasets
from llama_stack.apis.inference import Inference, UserMessage
from llama_stack.apis.inference import Inference, SystemMessage, UserMessage
from llama_stack.apis.scoring import Scoring
from llama_stack.distribution.datatypes import Api
from llama_stack.providers.datatypes import BenchmarksProtocolPrivate
from llama_stack.providers.inline.agents.meta_reference.agent_instance import (
MEMORY_QUERY_TOOL,
)
from llama_stack.providers.utils.common.data_schema_validator import (
ColumnName,
get_valid_schemas,
validate_dataset_schema,
)
from llama_stack.providers.utils.common.data_schema_validator import ColumnName
from llama_stack.providers.utils.kvstore import kvstore_impl
from .....apis.common.job_types import Job
@ -82,23 +78,24 @@ class MetaReferenceEvalImpl(
async def run_eval(
self,
benchmark_id: str,
task_config: BenchmarkConfig,
benchmark_config: BenchmarkConfig,
) -> Job:
task_def = self.benchmarks[benchmark_id]
dataset_id = task_def.dataset_id
candidate = task_config.eval_candidate
scoring_functions = task_def.scoring_functions
dataset_def = await self.datasets_api.get_dataset(dataset_id=dataset_id)
validate_dataset_schema(dataset_def.dataset_schema, get_valid_schemas(Api.eval.value))
all_rows = await self.datasetio_api.get_rows_paginated(
# TODO (xiyan): validate dataset schema
# dataset_def = await self.datasets_api.get_dataset(dataset_id=dataset_id)
all_rows = await self.datasetio_api.iterrows(
dataset_id=dataset_id,
rows_in_page=(-1 if task_config.num_examples is None else task_config.num_examples),
limit=(-1 if benchmark_config.num_examples is None else benchmark_config.num_examples),
)
res = await self.evaluate_rows(
benchmark_id=benchmark_id,
input_rows=all_rows.rows,
input_rows=all_rows.data,
scoring_functions=scoring_functions,
task_config=task_config,
benchmark_config=benchmark_config,
)
# TODO: currently needs to wait for generation before returning
@ -108,17 +105,17 @@ class MetaReferenceEvalImpl(
return Job(job_id=job_id)
async def _run_agent_generation(
self, input_rows: List[Dict[str, Any]], task_config: BenchmarkConfig
self, input_rows: List[Dict[str, Any]], benchmark_config: BenchmarkConfig
) -> List[Dict[str, Any]]:
candidate = task_config.eval_candidate
candidate = benchmark_config.eval_candidate
create_response = await self.agents_api.create_agent(candidate.config)
agent_id = create_response.agent_id
generations = []
for i, x in tqdm(enumerate(input_rows)):
assert ColumnName.chat_completion_input.value in x, "Invalid input row"
input_messages = eval(str(x[ColumnName.chat_completion_input.value]))
input_messages = [UserMessage(**x) for x in input_messages]
input_messages = json.loads(x[ColumnName.chat_completion_input.value])
input_messages = [UserMessage(**x) for x in input_messages if x["role"] == "user"]
# NOTE: only single-turn agent generation is supported. Create a new session for each input row
session_create_response = await self.agents_api.create_agent_session(agent_id, f"session-{i}")
@ -151,15 +148,15 @@ class MetaReferenceEvalImpl(
return generations
async def _run_model_generation(
self, input_rows: List[Dict[str, Any]], task_config: BenchmarkConfig
self, input_rows: List[Dict[str, Any]], benchmark_config: BenchmarkConfig
) -> List[Dict[str, Any]]:
candidate = task_config.eval_candidate
candidate = benchmark_config.eval_candidate
assert candidate.sampling_params.max_tokens is not None, "SamplingParams.max_tokens must be provided"
generations = []
for x in tqdm(input_rows):
if ColumnName.completion_input.value in x:
input_content = eval(str(x[ColumnName.completion_input.value]))
input_content = json.loads(x[ColumnName.completion_input.value])
response = await self.inference_api.completion(
model=candidate.model,
content=input_content,
@ -167,12 +164,12 @@ class MetaReferenceEvalImpl(
)
generations.append({ColumnName.generated_answer.value: response.completion_message.content})
elif ColumnName.chat_completion_input.value in x:
chat_completion_input_str = str(x[ColumnName.chat_completion_input.value])
input_messages = eval(chat_completion_input_str)
input_messages = [UserMessage(**x) for x in input_messages]
chat_completion_input_json = json.loads(x[ColumnName.chat_completion_input.value])
input_messages = [UserMessage(**x) for x in chat_completion_input_json if x["role"] == "user"]
messages = []
if candidate.system_message:
messages.append(candidate.system_message)
messages += [SystemMessage(**x) for x in chat_completion_input_json if x["role"] == "system"]
messages += input_messages
response = await self.inference_api.chat_completion(
model_id=candidate.model,
@ -190,13 +187,13 @@ class MetaReferenceEvalImpl(
benchmark_id: str,
input_rows: List[Dict[str, Any]],
scoring_functions: List[str],
task_config: BenchmarkConfig,
benchmark_config: BenchmarkConfig,
) -> EvaluateResponse:
candidate = task_config.eval_candidate
candidate = benchmark_config.eval_candidate
if candidate.type == "agent":
generations = await self._run_agent_generation(input_rows, task_config)
generations = await self._run_agent_generation(input_rows, benchmark_config)
elif candidate.type == "model":
generations = await self._run_model_generation(input_rows, task_config)
generations = await self._run_model_generation(input_rows, benchmark_config)
else:
raise ValueError(f"Invalid candidate type: {candidate.type}")
@ -205,9 +202,9 @@ class MetaReferenceEvalImpl(
input_r | generated_r for input_r, generated_r in zip(input_rows, generations, strict=False)
]
if task_config.scoring_params is not None:
if benchmark_config.scoring_params is not None:
scoring_functions_dict = {
scoring_fn_id: task_config.scoring_params.get(scoring_fn_id, None)
scoring_fn_id: benchmark_config.scoring_params.get(scoring_fn_id, None)
for scoring_fn_id in scoring_functions
}
else:
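The eval provider now parses chat_completion_input with json.loads (instead of eval) and splits the messages by role. A small sketch of that parsing step using plain dicts; the real code builds UserMessage / SystemMessage pydantic models from the same filtered lists:

import json

# hypothetical dataset row: the column holds a JSON-encoded list of chat messages
row = {
    "chat_completion_input": json.dumps(
        [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": "What is 2 + 2?"},
        ]
    )
}

parsed = json.loads(row["chat_completion_input"])
system_messages = [m for m in parsed if m["role"] == "system"]
user_messages = [m for m in parsed if m["role"] == "user"]
assert len(system_messages) == 1 and len(user_messages) == 1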

View file

@ -4,14 +4,14 @@
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from typing import Union
from typing import Any, Dict, Union
from .config import MetaReferenceInferenceConfig, MetaReferenceQuantizedInferenceConfig
async def get_provider_impl(
config: Union[MetaReferenceInferenceConfig, MetaReferenceQuantizedInferenceConfig],
_deps,
_deps: Dict[str, Any],
):
from .inference import MetaReferenceInferenceImpl

View file

@ -0,0 +1,33 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from pathlib import Path
from typing import List, Optional
from pydantic import BaseModel
from llama_stack.distribution.utils.model_utils import model_local_dir
class TokenResult(BaseModel):
token: int
text: str
logprobs: Optional[List[float]] = None
def model_checkpoint_dir(model_id) -> str:
checkpoint_dir = Path(model_local_dir(model_id))
paths = [Path(checkpoint_dir / f"consolidated.{ext}") for ext in ["pth", "00.pth"]]
if not any(p.exists() for p in paths):
checkpoint_dir = checkpoint_dir / "original"
assert checkpoint_dir.exists(), (
f"Could not find checkpoints in: {model_local_dir(model_id)}. "
f"If you try to use the native llama model, Please download model using `llama download --model-id {model_id}`"
f"Otherwise, please save you model checkpoint under {model_local_dir(model_id)}"
)
return str(checkpoint_dir)
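model_checkpoint_dir looks for consolidated.pth or consolidated.00.pth directly under the model directory and falls back to an original/ subdirectory. A small sketch of that resolution order on a hypothetical local path, mirroring the helper above:

from pathlib import Path

def resolve_checkpoint_dir(model_dir: str) -> Path:
    # mirrors model_checkpoint_dir, but takes a plain path instead of a model id
    checkpoint_dir = Path(model_dir)
    candidates = [checkpoint_dir / f"consolidated.{ext}" for ext in ["pth", "00.pth"]]
    if not any(p.exists() for p in candidates):
        checkpoint_dir = checkpoint_dir / "original"
    if not checkpoint_dir.exists():
        raise FileNotFoundError(f"Could not find checkpoints in: {model_dir}")
    return checkpoint_dir

# usage with a hypothetical download location:
# print(resolve_checkpoint_dir("/home/user/.llama/checkpoints/Llama3.1-8B-Instruct"))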

View file

@ -55,7 +55,7 @@ from llama_stack.providers.utils.inference.prompt_adapter import (
)
from .config import MetaReferenceInferenceConfig
from .generation import Llama
from .llama3.generation import Llama3
from .model_parallel import LlamaModelParallelGenerator
log = logging.getLogger(__name__)
@ -83,7 +83,7 @@ class MetaReferenceInferenceImpl(
self.generator = LlamaModelParallelGenerator(self.config, model_id, llama_model)
self.generator.start()
else:
self.generator = Llama.build(self.config, model_id, llama_model)
self.generator = Llama3.build(self.config, model_id, llama_model)
self.model_id = model_id
self.llama_model = llama_model
@ -111,7 +111,7 @@ class MetaReferenceInferenceImpl(
)
if llama_model is None:
raise ValueError(
"Please make sure your llama_model in model metadata or model identifier is in llama-models SKU list"
"Please make sure your llama_model in model metadata or model identifier is in Llama SKU list"
)
self.model_registry_helper = ModelRegistryHelper(
@ -136,11 +136,13 @@ class MetaReferenceInferenceImpl(
self,
model_id: str,
content: InterleavedContent,
sampling_params: Optional[SamplingParams] = SamplingParams(),
sampling_params: Optional[SamplingParams] = None,
response_format: Optional[ResponseFormat] = None,
stream: Optional[bool] = False,
logprobs: Optional[LogProbConfig] = None,
) -> Union[CompletionResponse, CompletionResponseStreamChunk]:
if sampling_params is None:
sampling_params = SamplingParams()
if logprobs:
assert logprobs.top_k == 1, f"Unexpected top_k={logprobs.top_k}"
@ -208,7 +210,6 @@ class MetaReferenceInferenceImpl(
logprobs = []
stop_reason = None
tokenizer = self.generator.formatter.tokenizer
for token_result in self.generator.completion(request):
tokens.append(token_result.token)
if token_result.text == "<|eot_id|>":
@ -245,7 +246,7 @@ class MetaReferenceInferenceImpl(
self,
model_id: str,
messages: List[Message],
sampling_params: Optional[SamplingParams] = SamplingParams(),
sampling_params: Optional[SamplingParams] = None,
response_format: Optional[ResponseFormat] = None,
tools: Optional[List[ToolDefinition]] = None,
tool_choice: Optional[ToolChoice] = ToolChoice.auto,
@ -254,6 +255,8 @@ class MetaReferenceInferenceImpl(
logprobs: Optional[LogProbConfig] = None,
tool_config: Optional[ToolConfig] = None,
) -> AsyncGenerator:
if sampling_params is None:
sampling_params = SamplingParams()
if logprobs:
assert logprobs.top_k == 1, f"Unexpected top_k={logprobs.top_k}"
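The signature change from sampling_params: Optional[SamplingParams] = SamplingParams() to a None default plus an explicit check avoids a single default instance being shared across calls. A self-contained sketch of the difference, using a hypothetical mutable params class in place of SamplingParams:

from dataclasses import dataclass, field
from typing import List, Optional

@dataclass
class Params:                        # hypothetical stand-in for SamplingParams
    stop: List[str] = field(default_factory=list)

def risky(p: Params = Params()):     # default evaluated once, at definition time
    p.stop.append("</s>")
    return p

def safe(p: Optional[Params] = None):
    if p is None:                    # fresh instance per call, as in the new code
        p = Params()
    p.stop.append("</s>")
    return p

assert len(risky().stop) == 1 and len(risky().stop) == 2   # shared default mutates
assert len(safe().stop) == 1 and len(safe().stop) == 1     # independent defaults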

View file

@ -0,0 +1,82 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# top-level folder for each specific model found within the models/ directory at
# the top-level of this source tree.
from dataclasses import dataclass
from enum import Enum
from typing import Optional
class QuantizationScheme(Enum):
int4_weight_int8_dynamic_activation = "int4_weight_int8_dynamic_activation"
@dataclass
class QuantizationArgs:
scheme: Optional[QuantizationScheme] = None
group_size: Optional[int] = None
spinquant: bool = False
def __init__(self, **kwargs):
for k, v in kwargs.items():
if k == "scheme":
setattr(self, k, QuantizationScheme(v))
else:
if hasattr(self, k):
setattr(self, k, v)
@dataclass
class LoRAArgs:
rank: int
scale: float
@dataclass
class ModelArgs:
dim: int = 4096
n_layers: int = 32
n_heads: int = 32
n_kv_heads: Optional[int] = None
vocab_size: int = -1
multiple_of: int = 256 # make SwiGLU hidden layer size multiple of large power of 2
ffn_dim_multiplier: Optional[float] = None
norm_eps: float = 1e-5
rope_theta: float = 500000
use_scaled_rope: bool = False
max_batch_size: int = 32
max_seq_len: int = 2048
# vision model params
vision_chunk_size: int = -1 # image resolution for image models
vision_max_num_chunks: int = 4
vision_num_cross_attention_layers: int = -1
quantization_args: Optional[QuantizationArgs] = None
lora_args: Optional[LoRAArgs] = None
def __init__(self, **kwargs):
for k, v in kwargs.items():
if k == "lora_args":
setattr(self, k, LoRAArgs(**v))
elif k == "quantization_args":
setattr(self, k, QuantizationArgs(**v))
else:
if hasattr(self, k):
setattr(self, k, v)
if self.n_kv_heads is None:
self.n_kv_heads = self.n_heads
assert self.n_kv_heads <= self.n_heads
assert self.n_heads % self.n_kv_heads == 0
assert self.dim % self.n_heads == 0

View file

@ -23,15 +23,7 @@ from fairscale.nn.model_parallel.initialize import (
initialize_model_parallel,
model_parallel_is_initialized,
)
from llama_models.llama3.api.args import ModelArgs
from llama_models.llama3.api.chat_format import ChatFormat, LLMInput
from llama_models.llama3.api.tokenizer import Tokenizer
from llama_models.llama3.reference_impl.model import Transformer
from llama_models.llama3.reference_impl.multimodal.model import (
CrossAttentionTransformer,
)
from lmformatenforcer import JsonSchemaParser, TokenEnforcer, TokenEnforcerTokenizerData
from pydantic import BaseModel
from llama_stack.apis.inference import (
Fp8QuantizationConfig,
@ -39,46 +31,30 @@ from llama_stack.apis.inference import (
ResponseFormat,
ResponseFormatType,
)
from llama_stack.distribution.utils.model_utils import model_local_dir
from llama_stack.models.llama.datatypes import (
GreedySamplingStrategy,
Model,
SamplingParams,
TopPSamplingStrategy,
)
from llama_stack.models.llama.llama3.chat_format import ChatFormat, LLMInput
from llama_stack.models.llama.llama3.tokenizer import Tokenizer
from llama_stack.models.llama.sku_list import resolve_model
from llama_stack.providers.utils.inference.prompt_adapter import (
ChatCompletionRequestWithRawContent,
CompletionRequestWithRawContent,
)
from .config import MetaReferenceInferenceConfig, MetaReferenceQuantizedInferenceConfig
from ..common import TokenResult, model_checkpoint_dir
from ..config import MetaReferenceInferenceConfig, MetaReferenceQuantizedInferenceConfig
from .args import ModelArgs
from .model import Transformer
from .multimodal.model import CrossAttentionTransformer
log = logging.getLogger(__name__)
def model_checkpoint_dir(model_id) -> str:
checkpoint_dir = Path(model_local_dir(model_id))
paths = [Path(checkpoint_dir / f"consolidated.{ext}") for ext in ["pth", "00.pth"]]
if not any(p.exists() for p in paths):
checkpoint_dir = checkpoint_dir / "original"
assert checkpoint_dir.exists(), (
f"Could not find checkpoints in: {model_local_dir(model_id)}. "
f"If you try to use the native llama model, Please download model using `llama download --model-id {model_id}`"
f"Otherwise, please save you model checkpoint under {model_local_dir(model_id)}"
)
return str(checkpoint_dir)
class TokenResult(BaseModel):
token: int
text: str
logprobs: Optional[List[float]] = None
class Llama:
class Llama3:
@staticmethod
def build(
config: Union[MetaReferenceInferenceConfig, MetaReferenceQuantizedInferenceConfig],
@ -170,7 +146,7 @@ class Llama:
if isinstance(config, MetaReferenceQuantizedInferenceConfig):
if isinstance(config.quantization, Fp8QuantizationConfig):
from .quantization.loader import convert_to_fp8_quantized_model
from ..quantization.loader import convert_to_fp8_quantized_model
# load on CPU in bf16 so that fp8 conversion does not find an
# unexpected (fp32, e.g.) datatype
@ -183,7 +159,7 @@ class Llama:
model.load_state_dict(state_dict, strict=False)
model = convert_to_fp8_quantized_model(model, config, ckpt_dir)
elif isinstance(config.quantization, Int4QuantizationConfig):
from .quantization.loader import convert_to_int4_quantized_model
from ..quantization.loader import convert_to_int4_quantized_model
model = Transformer(model_args)
model = convert_to_int4_quantized_model(model, model_args, config)
@ -193,7 +169,7 @@ class Llama:
# Add a wrapper for adding hadamard transform for spinquant.
# This needs to be done after loading the state dict otherwise an error will be raised while
# loading the state dict.
from .quantization.hadamard_utils import (
from ..quantization.hadamard_utils import (
add_hadamard_transform_for_spinquant,
)
@ -222,7 +198,7 @@ class Llama:
model.to(device)
log.info(f"Loaded in {time.time() - start_time:.2f} seconds")
return Llama(model, tokenizer, model_args, llama_model_id)
return Llama3(model, tokenizer, model_args, llama_model_id)
def __init__(
self,

View file

@ -0,0 +1,311 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# top-level folder for each specific model found within the models/ directory at
# the top-level of this source tree.
# Copyright (c) Meta Platforms, Inc. and affiliates.
# This software may be used and distributed in accordance with the terms of the Llama 3 Community License Agreement.
import math
from typing import Optional, Tuple
import fairscale.nn.model_parallel.initialize as fs_init
import torch
import torch.nn.functional as F
from fairscale.nn.model_parallel.layers import (
ColumnParallelLinear,
RowParallelLinear,
VocabParallelEmbedding,
)
from torch import nn
from .args import ModelArgs
class RMSNorm(torch.nn.Module):
def __init__(self, dim: int, eps: float = 1e-6):
super().__init__()
self.eps = eps
self.weight = nn.Parameter(torch.ones(dim))
def _norm(self, x):
return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)
def forward(self, x):
output = self._norm(x.float()).type_as(x)
return output * self.weight
def apply_scaling(freqs: torch.Tensor) -> torch.Tensor:
# Values obtained from grid search
scale_factor = 8
low_freq_factor = 1
high_freq_factor = 4
old_context_len = 8192 # original llama3 length
low_freq_wavelen = old_context_len / low_freq_factor
high_freq_wavelen = old_context_len / high_freq_factor
wavelen = 2 * torch.pi / freqs
new_freqs = torch.where(wavelen > low_freq_wavelen, freqs / scale_factor, freqs)
smooth = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor)
return torch.where(
(wavelen >= high_freq_wavelen) & (wavelen <= low_freq_wavelen),
(1 - smooth) * new_freqs / scale_factor + smooth * new_freqs,
new_freqs,
)
def precompute_freqs_cis(dim: int, end: int, theta: float = 10000.0, use_scaled: bool = False):
freqs = 1.0 / (theta ** (torch.arange(0, dim, 2)[: (dim // 2)].float() / dim))
t = torch.arange(end, device=freqs.device, dtype=torch.float32)
if use_scaled:
freqs = apply_scaling(freqs)
freqs = torch.outer(t, freqs)
freqs_cis = torch.polar(torch.ones_like(freqs), freqs) # complex64
return freqs_cis
def reshape_for_broadcast(freqs_cis: torch.Tensor, x: torch.Tensor):
ndim = x.ndim
assert 0 <= 1 < ndim
assert freqs_cis.shape == (x.shape[1], x.shape[-1])
shape = [d if i == 1 or i == ndim - 1 else 1 for i, d in enumerate(x.shape)]
return freqs_cis.view(*shape)
def apply_rotary_emb(
xq: torch.Tensor,
xk: torch.Tensor,
freqs_cis: torch.Tensor,
) -> Tuple[torch.Tensor, torch.Tensor]:
xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))
xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))
freqs_cis = reshape_for_broadcast(freqs_cis, xq_)
xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)
xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)
return xq_out.type_as(xq), xk_out.type_as(xk)
def repeat_kv(x: torch.Tensor, n_rep: int) -> torch.Tensor:
"""torch.repeat_interleave(x, dim=2, repeats=n_rep)"""
bs, slen, n_kv_heads, head_dim = x.shape
if n_rep == 1:
return x
return (
x[:, :, :, None, :]
.expand(bs, slen, n_kv_heads, n_rep, head_dim)
.reshape(bs, slen, n_kv_heads * n_rep, head_dim)
)
class Attention(nn.Module):
def __init__(self, args: ModelArgs):
super().__init__()
self.n_kv_heads = args.n_heads if args.n_kv_heads is None else args.n_kv_heads
model_parallel_size = fs_init.get_model_parallel_world_size()
self.n_local_heads = args.n_heads // model_parallel_size
self.n_local_kv_heads = self.n_kv_heads // model_parallel_size
self.n_rep = self.n_local_heads // self.n_local_kv_heads
self.head_dim = args.dim // args.n_heads
self.wq = ColumnParallelLinear(
args.dim,
args.n_heads * self.head_dim,
bias=False,
gather_output=False,
init_method=lambda x: x,
)
self.wk = ColumnParallelLinear(
args.dim,
self.n_kv_heads * self.head_dim,
bias=False,
gather_output=False,
init_method=lambda x: x,
)
self.wv = ColumnParallelLinear(
args.dim,
self.n_kv_heads * self.head_dim,
bias=False,
gather_output=False,
init_method=lambda x: x,
)
self.wo = RowParallelLinear(
args.n_heads * self.head_dim,
args.dim,
bias=False,
input_is_parallel=True,
init_method=lambda x: x,
)
self.cache_k = torch.zeros(
(
args.max_batch_size,
args.max_seq_len,
self.n_local_kv_heads,
self.head_dim,
)
)
self.cache_v = torch.zeros(
(
args.max_batch_size,
args.max_seq_len,
self.n_local_kv_heads,
self.head_dim,
)
)
def forward(
self,
x: torch.Tensor,
start_pos: int,
freqs_cis: torch.Tensor,
mask: Optional[torch.Tensor],
):
bsz, seqlen, _ = x.shape
xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)
xq = xq.view(bsz, seqlen, self.n_local_heads, self.head_dim)
xk = xk.view(bsz, seqlen, self.n_local_kv_heads, self.head_dim)
xv = xv.view(bsz, seqlen, self.n_local_kv_heads, self.head_dim)
xq, xk = apply_rotary_emb(xq, xk, freqs_cis=freqs_cis)
self.cache_k = self.cache_k.to(xq)
self.cache_v = self.cache_v.to(xq)
self.cache_k[:bsz, start_pos : start_pos + seqlen] = xk
self.cache_v[:bsz, start_pos : start_pos + seqlen] = xv
keys = self.cache_k[:bsz, : start_pos + seqlen]
values = self.cache_v[:bsz, : start_pos + seqlen]
# repeat k/v heads if n_kv_heads < n_heads
keys = repeat_kv(keys, self.n_rep) # (bs, cache_len + seqlen, n_local_heads, head_dim)
values = repeat_kv(values, self.n_rep) # (bs, cache_len + seqlen, n_local_heads, head_dim)
xq = xq.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)
keys = keys.transpose(1, 2) # (bs, n_local_heads, cache_len + seqlen, head_dim)
values = values.transpose(1, 2) # (bs, n_local_heads, cache_len + seqlen, head_dim)
scores = torch.matmul(xq, keys.transpose(2, 3)) / math.sqrt(self.head_dim)
if mask is not None:
scores = scores + mask # (bs, n_local_heads, seqlen, cache_len + seqlen)
scores = F.softmax(scores.float(), dim=-1).type_as(xq)
output = torch.matmul(scores, values) # (bs, n_local_heads, seqlen, head_dim)
output = output.transpose(1, 2).contiguous().view(bsz, seqlen, -1)
return self.wo(output)
class FeedForward(nn.Module):
def __init__(
self,
dim: int,
hidden_dim: int,
multiple_of: int,
ffn_dim_multiplier: Optional[float],
):
super().__init__()
hidden_dim = int(2 * hidden_dim / 3)
# custom dim factor multiplier
if ffn_dim_multiplier is not None:
hidden_dim = int(ffn_dim_multiplier * hidden_dim)
hidden_dim = multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of)
self.w1 = ColumnParallelLinear(dim, hidden_dim, bias=False, gather_output=False, init_method=lambda x: x)
self.w2 = RowParallelLinear(hidden_dim, dim, bias=False, input_is_parallel=True, init_method=lambda x: x)
self.w3 = ColumnParallelLinear(dim, hidden_dim, bias=False, gather_output=False, init_method=lambda x: x)
def forward(self, x):
return self.w2(F.silu(self.w1(x)) * self.w3(x))
class TransformerBlock(nn.Module):
def __init__(self, layer_id: int, args: ModelArgs):
super().__init__()
self.n_heads = args.n_heads
self.dim = args.dim
self.head_dim = args.dim // args.n_heads
self.attention = Attention(args)
self.feed_forward = FeedForward(
dim=args.dim,
hidden_dim=4 * args.dim,
multiple_of=args.multiple_of,
ffn_dim_multiplier=args.ffn_dim_multiplier,
)
self.layer_id = layer_id
self.attention_norm = RMSNorm(args.dim, eps=args.norm_eps)
self.ffn_norm = RMSNorm(args.dim, eps=args.norm_eps)
def forward(
self,
x: torch.Tensor,
start_pos: int,
freqs_cis: torch.Tensor,
mask: Optional[torch.Tensor],
):
h = x + self.attention(self.attention_norm(x), start_pos, freqs_cis, mask)
out = h + self.feed_forward(self.ffn_norm(h))
return out
class Transformer(nn.Module):
def __init__(self, params: ModelArgs):
super().__init__()
self.params = params
self.vocab_size = params.vocab_size
self.n_layers = params.n_layers
self.tok_embeddings = VocabParallelEmbedding(params.vocab_size, params.dim, init_method=lambda x: x)
self.layers = torch.nn.ModuleList()
for layer_id in range(params.n_layers):
self.layers.append(TransformerBlock(layer_id, params))
self.norm = RMSNorm(params.dim, eps=params.norm_eps)
self.output = ColumnParallelLinear(params.dim, params.vocab_size, bias=False, init_method=lambda x: x)
self.freqs_cis = precompute_freqs_cis(
params.dim // params.n_heads,
params.max_seq_len * 2,
params.rope_theta,
params.use_scaled_rope,
)
@torch.inference_mode()
def forward(self, tokens: torch.Tensor, start_pos: int):
_bsz, seqlen = tokens.shape
h = self.tok_embeddings(tokens)
self.freqs_cis = self.freqs_cis.to(h.device)
freqs_cis = self.freqs_cis[start_pos : start_pos + seqlen]
mask = None
if seqlen > 1:
mask = torch.full((seqlen, seqlen), float("-inf"), device=tokens.device)
mask = torch.triu(mask, diagonal=1)
# https://github.com/pytorch/pytorch/issues/100005
# torch.triu is buggy when the device is mps: filled values are
# nan instead of 0.
if mask.device.type == torch.device("mps").type:
mask = torch.nan_to_num(mask, nan=0.0)
# When performing key-value caching, we compute the attention scores
# only for the new sequence. Thus, the matrix of scores is of size
# (seqlen, cache_len + seqlen), and the only masked entries are (i, j) for
# j > cache_len + i, since row i corresponds to token cache_len + i.
mask = torch.hstack([torch.zeros((seqlen, start_pos), device=tokens.device), mask]).type_as(h)
for layer in self.layers:
h = layer(h, start_pos, freqs_cis, mask)
h = self.norm(h)
output = self.output(h).float()
return output
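A quick shape check for the rotary-embedding helpers defined above: precompute_freqs_cis returns a complex tensor of shape (end, dim // 2), and apply_rotary_emb preserves the query/key shapes. The sizes here are made up and small:

import torch

# precompute_freqs_cis and apply_rotary_emb are the helpers defined in this file
bsz, seqlen, n_heads, head_dim = 2, 5, 4, 8
freqs_cis = precompute_freqs_cis(head_dim, seqlen)          # (5, 4), complex64
xq = torch.randn(bsz, seqlen, n_heads, head_dim)
xk = torch.randn(bsz, seqlen, n_heads, head_dim)
xq_out, xk_out = apply_rotary_emb(xq, xk, freqs_cis=freqs_cis)
assert freqs_cis.shape == (seqlen, head_dim // 2)
assert xq_out.shape == xq.shape and xk_out.shape == xk.shape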

View file

@ -0,0 +1,12 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# top-level folder for each specific model found within the models/ directory at
# the top-level of this source tree.

View file

@ -0,0 +1,179 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# top-level folder for each specific model found within the models/ directory at
# the top-level of this source tree.
# Copyright (c) Meta Platforms, Inc. and its affiliates.
import math
from logging import getLogger
import torch
import torch.nn.functional as F
from .utils import get_negative_inf_value, to_2tuple
logger = getLogger()
def resize_local_position_embedding(orig_pos_embed, grid_size):
"""
Resize position embedding for vision encoder.
Original position embedding is [n_tiles * n_tiles + 1, dim]
New position embedding will be [grid_size[0] * grid_size[1] + 1, dim]
"""
new_grid_size = to_2tuple(grid_size)
orig_grid_size = to_2tuple(int(math.sqrt(len(orig_pos_embed) - 1)))
new_pos_emb_tok, new_pos_emb_img = (
orig_pos_embed[:1],
orig_pos_embed[1:],
)
logger.info(f"resizing position embedding grid-size from {orig_grid_size} to {new_grid_size}")
new_pos_emb_img = new_pos_emb_img.reshape(1, orig_grid_size[0], orig_grid_size[1], -1).permute(0, 3, 1, 2)
new_pos_emb_img = F.interpolate(
new_pos_emb_img,
size=new_grid_size,
mode="bilinear",
align_corners=True,
)
new_pos_emb_img = new_pos_emb_img.permute(0, 2, 3, 1).reshape(1, new_grid_size[0] * new_grid_size[1], -1)[0]
new_pos_embed = torch.cat([new_pos_emb_tok, new_pos_emb_img], dim=0)
return new_pos_embed
def initialize_global_position_embedding_from_local(pos_and_cls_embed, grid_size, x_scale, y_scale):
"""
Takes a local position embedding for vision encoder and uses it
to initialize the global position embedding.
Input: local position embedding of shape [grid_size[0] * grid_size[1] + 1, dim]
Returns: global position embedding of shape [x_scale, y_scale, grid_size[0] * grid_size[1] + 1, dim]
Here x_scale and y_scale are the number of tiles along x-axis and y-axis respectively.
"""
pos_embed = pos_and_cls_embed[1:]
cls_embed = pos_and_cls_embed[0].view(1, 1, 1, -1)
grid_size = to_2tuple(grid_size)
new_pos_emb_img = pos_embed.reshape(1, grid_size[0], grid_size[1], -1).permute(0, 3, 1, 2)
new_grid_size = (x_scale * grid_size[0], y_scale * grid_size[1])
new_pos_emb_img = F.interpolate(
new_pos_emb_img,
size=new_grid_size,
mode="bilinear",
align_corners=True,
)
new_pos_emb_img = new_pos_emb_img.permute(0, 2, 3, 1)
new_pos_emb_img = new_pos_emb_img.view(x_scale, grid_size[0], y_scale, grid_size[1], -1)
new_pos_emb_img = new_pos_emb_img.permute(0, 2, 1, 3, 4).contiguous()
new_pos_emb_img = new_pos_emb_img.reshape(x_scale, y_scale, grid_size[0] * grid_size[1], -1)
cls_embed = cls_embed.expand(x_scale, y_scale, -1, -1)
pos_and_cls_embed = torch.cat([cls_embed, new_pos_emb_img], dim=2)
return pos_and_cls_embed
def resize_global_position_embedding(pos_and_cls_embed, grid_size, x_scale, y_scale):
"""
Takes a global position embedding for vision encoder and resizes it to new size.
Input: global position embedding of shape [x_old, y_old, old_grid_size[0] * old_grid_size[1] + 1, dim]
Returns: global position embedding of shape [x_scale, y_scale, grid_size[0] * grid_size[1] + 1, dim]
Here x_scale and y_scale are the number of tiles along x-axis and y-axis respectively.
"""
# first remove cls token
pos_embed = pos_and_cls_embed[:, :, 1:]
cls_embed = pos_and_cls_embed[:, :, 0].unsqueeze(2)
xs_old, ys_old, ntok, dim = pos_embed.shape
old_grid_size = int(math.sqrt(ntok))
# move to correct form for interpolation
pos_embed = pos_embed.view(xs_old, ys_old, old_grid_size, old_grid_size, dim)
pos_embed = pos_embed.permute(0, 2, 1, 3, 4).contiguous()
pos_embed = pos_embed.view(xs_old * old_grid_size, ys_old * old_grid_size, dim)
pos_embed = pos_embed.unsqueeze(0)
# interpolate
new_size = (grid_size[0] * x_scale, grid_size[1] * y_scale)
pos_embed = pos_embed.permute(0, 3, 1, 2)
pos_embed_resized = F.interpolate(
pos_embed,
size=new_size,
mode="bilinear",
align_corners=True,
)
pos_embed = pos_embed_resized.permute(0, 2, 3, 1)[0]
# move it back in place
pos_embed = pos_embed.view(x_scale, grid_size[0], y_scale, grid_size[1], dim)
pos_embed = pos_embed.permute(0, 2, 1, 3, 4).contiguous()
pos_embed = pos_embed.view(x_scale, y_scale, grid_size[0] * grid_size[1], dim)
# interpolate cls token
cls_embed = cls_embed.permute(2, 3, 0, 1)
cls_embed_resized = F.interpolate(
cls_embed,
size=(x_scale, y_scale),
mode="bilinear",
align_corners=True,
)
cls_embed = cls_embed_resized.permute(2, 3, 0, 1)
# add cls token back in
pos_and_cls_embed = torch.cat([cls_embed, pos_embed], dim=2)
return pos_and_cls_embed
def build_encoder_attention_mask(
x: torch.Tensor,
ar: torch.Tensor,
ntok: int,
num_chunks: int,
n_heads: int,
):
"""
Build vision encoder attention mask that omits padding tokens.
"""
masks = []
for arx in ar:
mask_i = torch.ones((num_chunks, x.shape[2], 1), dtype=x.dtype)
mask_i[: arx[0] * arx[1], :ntok] = 0
mask_i = mask_i.view(num_chunks * x.shape[2], -1)
mask_i = mask_i @ mask_i.T * get_negative_inf_value(x.dtype)
mask_i = mask_i.unsqueeze(0)
masks.append(mask_i)
masks = torch.stack(masks).to(x.device).expand(-1, n_heads, -1, -1)
return masks
def expand_num_tokens_to_mult8(x):
num_pad_tokens = 8 - (x.shape[-2] % 8)
if num_pad_tokens == 0:
return x, 0
else:
return (
torch.cat(
[
x,
torch.zeros(
(x.shape[0], x.shape[1], num_pad_tokens, x.shape[-1]),
dtype=x.dtype,
device=x.device,
),
],
dim=-2,
),
num_pad_tokens,
)
def contract_num_tokens_from_mult8(x, num_pad_tokens):
if num_pad_tokens == 0:
return x
return x[:, :, :-num_pad_tokens]
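A small round-trip sketch for the two padding helpers above; the tensor shape is illustrative, and torch is assumed available as in the rest of this module.
import torch
# (bsz, num_chunks, ntok, dim) with ntok not a multiple of 8
x = torch.randn(1, 4, 1027, 1280)
x_padded, n_pad = expand_num_tokens_to_mult8(x)
assert x_padded.shape[-2] % 8 == 0
# Contracting with the reported pad count restores the original shape
x_restored = contract_num_tokens_from_mult8(x_padded, n_pad)
assert x_restored.shape == x.shape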

View file

@ -0,0 +1,408 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# top-level folder for each specific model found within the models/ directory at
# the top-level of this source tree.
import math
from collections import defaultdict
from logging import getLogger
from typing import Any, Optional, Set, Tuple
import torch
import torchvision.transforms as tv
from PIL import Image
from torchvision.transforms import functional as F
IMAGE_RES = 224
logger = getLogger()
class VariableSizeImageTransform(object):
"""
This class accepts an image of any size and dynamically resizes, pads and chunks it
based on the image aspect ratio and the number of image chunks we allow.
The algorithm will NOT distort the image to fit a certain aspect ratio, because
that leads to a significant degradation in image quality.
It can be summarized in 6 steps:
1. Find all possible canvas combinations of max_num_chunks;
2. Find the best canvas to fit the image;
3. Resize without distortion;
4. Pad;
5. Normalize;
6. Chunk.
For example, if an input image is of size 300x800, patch_size of 224,
and max_num_chunks = 8, it will find the closest aspect ratio that
is allowed within 8 image chunks, with some restrictions.
In this case, 2:4 = 2 horizontal patches and 4 vertical patches,
giving a total of 8 chunks.
If resize_to_max_canvas, the image will be resized (without distortion),
to the largest possible resolution. In this case, 388:896, and padded to 448:896,
where we maintain the original aspect ratio and pad the rest with zeros.
This approach minimizes the amount of padding required for any arbitrary resolution.
However, if limit_upscaling_to_patch_size is set to True,
the upscaling will be limited to the patch size. In the example above,
the image would remain 300x800 (no upscaling), and then padded to 448:896.
The final output will therefore be of shape (8, 3, 224, 224), where 2x4
patches are coming from the resizing and chunking.
"""
def __init__(self, size: int = IMAGE_RES) -> None:
self.size = size
logger.info(f"VariableSizeImageTransform size: {self.size}")
self.to_tensor = tv.ToTensor()
self._mean = (0.48145466, 0.4578275, 0.40821073)
self._std = (0.26862954, 0.26130258, 0.27577711)
self.normalize = tv.Normalize(
mean=self._mean,
std=self._std,
inplace=True,
)
self.resample = tv.InterpolationMode.BILINEAR
@staticmethod
def get_factors(n: int) -> Set[int]:
"""
Calculate all factors of a given number, i.e. every divisor that leaves
no remainder. For example, if n=12, it will return {1, 2, 3, 4, 6, 12}.
Args:
n (int): The number to find factors for.
Returns:
set: A set containing all factors of the number.
"""
factors_set = set()
for i in range(1, int(n**0.5) + 1):
if n % i == 0:
factors_set.add(i)
factors_set.add(n // i)
return factors_set
def find_supported_resolutions(self, max_num_chunks: int, patch_size: int) -> torch.Tensor:
"""
Computes all of the allowed resolutions for a fixed number of chunks
and patch_size. Useful when dividing an image into chunks.
Args:
max_num_chunks (int): Maximum number of chunks for processing.
patch_size (int): Size of the side of the patch.
Returns:
torch.Tensor: List of possible resolutions as tuples (height, width).
Example:
>>> max_num_chunks = 5
>>> patch_size = 224
>>> find_supported_resolutions(max_num_chunks, patch_size)
tensor([(224, 896), (448, 448), (224, 224), (896, 224), (224, 672),
(672, 224), (224, 448), (448, 224)])
Given max_num_chunks=4, patch_size=224, it will create a dictionary:
{
0.25: [(1, 4)],
1.0: [(2, 2), (1, 1)],
4.0: [(4, 1)],
0.33: [(1, 3)],
3.0: [(3, 1)],
0.5: [(1, 2)],
2.0: [(2, 1)]
}
and return the resolutions multiplied by the patch_size:
[(1*224, 4*224), (2*224, 2*224), ..., (2*224, 1*224)]
"""
asp_dict = defaultdict(list)
for chunk_size in range(max_num_chunks, 0, -1):
_factors = sorted(self.get_factors(chunk_size))
_asp_ratios = [(factor, chunk_size // factor) for factor in _factors]
for height, width in _asp_ratios:
ratio_float = height / width
asp_dict[ratio_float].append((height, width))
# get the resolutions multiplied by the patch_size
possible_resolutions = []
for value in asp_dict.values():
for height, depth in value:
possible_resolutions.append((height * patch_size, depth * patch_size))
return possible_resolutions
@staticmethod
def get_max_res_without_distortion(
image_size: Tuple[int, int],
target_size: Tuple[int, int],
) -> Tuple[int, int]:
"""
Determines the maximum resolution to which an image can be resized to without distorting its
aspect ratio, based on the target resolution.
Args:
image_size (Tuple[int, int]): The original resolution of the image (height, width).
target_size (Tuple[int, int]): The desired resolution to fit the image into (height, width).
Returns:
Tuple[int, int]: The optimal dimensions (height, width) to which the image should be resized.
Example:
>>> _get_max_res_without_distortion([200, 300], target_size = [450, 200])
(134, 200)
>>> _get_max_res_without_distortion([800, 600], target_size = [450, 1300])
(450, 338)
"""
original_width, original_height = image_size
target_width, target_height = target_size
scale_w = target_width / original_width
scale_h = target_height / original_height
if scale_w < scale_h:
new_width = target_width
new_height = min(math.floor(original_height * scale_w), target_height)
else:
new_height = target_height
new_width = min(math.floor(original_width * scale_h), target_width)
return new_width, new_height
def _pad(self, image: Image.Image, target_size) -> Image.Image:
new_width, new_height = target_size
new_im = Image.new(mode="RGB", size=(new_width, new_height), color=(0, 0, 0)) # type: ignore
new_im.paste(image)
return new_im
def _split(self, image: torch.Tensor, ncw: int, nch: int) -> torch.Tensor:
# Split image into number of required tiles (width x height)
num_channels, height, width = image.size()
image = image.view(num_channels, nch, height // nch, ncw, width // ncw)
# Permute dimensions to reorder the axes
image = image.permute(1, 3, 0, 2, 4).contiguous()
# Reshape into the desired output shape (ncw * nch, num_channels, height // nch, width // ncw)
image = image.view(ncw * nch, num_channels, height // nch, width // ncw)
return image
def resize_without_distortion(
self,
image: torch.Tensor,
target_size: Tuple[int, int],
max_upscaling_size: Optional[int],
) -> torch.Tensor:
"""
Used to resize an image to target_size, without distortion.
If target_size requires upscaling the image, the user can set max_upscaling_size to
limit the upscaling to a maximum size. In this case, since we rescale without distortion,
modifying target_size works as a boundary for the image's largest side.
Args:
resample (str): Resampling method used when resizing images.
Supports "nearest", "nearest_exact", "bilinear", "bicubic".
max_upscaling_size (int): The maximum size to upscale the image to.
If None, there is no limit.
Examples:
>>> target_size = (1000, 1200)
>>> max_upscaling_size = 600
>>> image_size = (400, 200)
>>> resize_without_distortion(image_size, target_size, max_upscaling_size)
(600, 300) # new_size_without_distortion
>>> target_size = (1000, 1200)
>>> max_upscaling_size = 600
>>> image_size = (2000, 200)
>>> resize_without_distortion(image_size, target_size, max_upscaling_size)
(1000, 100) # new_size_without_distortion
>>> target_size = (1000, 1200)
>>> max_upscaling_size = 2000
>>> image_size = (400, 200)
>>> resize_without_distortion(image_size, target_size, max_upscaling_size)
(1000, 500) # new_size_without_distortion
>>> target_size = (1000, 1200)
>>> max_upscaling_size = None
>>> image_size = (400, 200)
>>> resize_without_distortion(image_size, target_size, max_upscaling_size)
(1000, 500) # new_size_without_distortion
"""
image_width, image_height = image.size
image_size = (image_width, image_height)
# If target_size requires upscaling, we might want to limit the upscaling to max_upscaling_size
if max_upscaling_size is not None:
new_target_width = min(max(image_width, max_upscaling_size), target_size[0])
new_target_height = min(max(image_height, max_upscaling_size), target_size[1])
target_size = (new_target_width, new_target_height)
# resize to target_size while preserving aspect ratio
new_size_without_distortion = self.get_max_res_without_distortion(image_size, target_size)
image = F.resize(
image,
(new_size_without_distortion[1], new_size_without_distortion[0]),
interpolation=self.resample,
)
return image
def get_best_fit(
self,
image_size: Tuple[int, int],
possible_resolutions: torch.Tensor,
resize_to_max_canvas: bool = False,
) -> Tuple[int, int]:
"""
Determines the best canvas, from a list of possible resolutions, to which an image
can be resized without distortion.
For each possible resolution, calculates the scaling factors for
width and height, and selects the smallest one, which is the limiting side.
E.g. to match the canvas you can upscale height by 2x, and width by 1.5x,
therefore, the maximum upscaling you can do is min(2, 1.5) = 1.5.
If upscaling is possible (any of the scaling factors is greater than 1),
then picks the smallest upscaling factor > 1, unless resize_to_max_canvas is True.
If upscaling is not possible, then picks the largest scaling factor <= 1, i.e.
reduce downscaling as much as possible.
If there are multiple resolutions with the same max scale, we pick the one with the lowest area,
to minimize padding. E.g., the same image can be upscaled to 224x224 and 224x448, but the latter
has more padding.
Args:
image_size (Tuple[int, int]): A tuple containing the height and width of the image.
possible_resolutions (torch.Tensor): A tensor of shape (N, 2) where each
row represents a possible resolution (height, width).
resize_to_max_canvas (bool): If True, will return the largest upscaling resolution.
Returns:
List[int]: The best resolution [height, width] for the given image.
Example:
>>> image_size = (200, 300)
>>> possible_resolutions = torch.tensor([[224, 672],
... [672, 224],
... [224, 448],
... [448, 224],
... [224, 224]])
>>> _get_smallest_upscaling_possibility(image_size, possible_resolutions)
[224, 448]
We have:
scale_w = tensor([2.2400, 0.7467, 1.4933, 0.7467, 0.7467])
scale_h = tensor([1.1200, 3.3600, 1.1200, 2.2400, 1.1200])
scales = tensor([1.1200, 0.7467, 1.1200, 0.7467, 0.7467])
Only one of the scales > 1:
upscaling_possible = tensor([1.1200, 1.1200])
smallest_rescale = tensor(1.1200)
So we pick the resolution with the smallest area:
areas = tensor([150528, 100352]) # [672, 224], [224, 448]
optimal_canvas = tensor([224, 448])
"""
original_width, original_height = image_size
# get all possible resolutions heights/widths
target_widths, target_heights = (
possible_resolutions[:, 0],
possible_resolutions[:, 1],
)
# get scaling factors to resize the image without distortion
scale_w = target_widths / original_width
scale_h = target_heights / original_height
# get the min scale between width and height (limiting side -> no distortion)
scales = torch.where(scale_w > scale_h, scale_h, scale_w)
# filter only scales that allow upscaling
upscaling_options = scales[scales >= 1]
if len(upscaling_options) > 0:
if resize_to_max_canvas:
selected_scale = torch.max(upscaling_options)
else:
selected_scale = torch.min(upscaling_options)
else:
# no upscaling possible,
# get the minimum downscaling (max scale for scales<1)
downscaling_options = scales[scales < 1]
selected_scale = torch.max(downscaling_options)
# get all resolutions that support this scaling factor,
# e.g. you can upscale to 224x224, 224x448, 224x672 without distortion
chosen_canvas = possible_resolutions[scales == selected_scale]
# if there are multiple resolutions,
# get the one with minimum area to reduce padding
if len(chosen_canvas) > 1:
areas = chosen_canvas[:, 0] * chosen_canvas[:, 1]
optimal_idx = torch.argmin(areas)
optimal_canvas = chosen_canvas[optimal_idx]
else:
optimal_canvas = chosen_canvas[0]
return tuple(optimal_canvas.tolist())
def __call__(
self,
image: Image.Image,
max_num_chunks: int,
normalize_img: bool = True,
resize_to_max_canvas: bool = False,
) -> Tuple[Any, Any]:
"""
Args:
image (PIL.Image): Image to be resized.
max_num_chunks (int): Maximum number of chunks to split the image into.
normalize_img (bool): Whether to normalize the image.
resize_to_max_canvas (bool): Whether to resize the image to the maximum canvas size.
If True, picks the canvas that allows the largest resizing without distortion.
If False, downsample as little as possible, including no resizing at all,
but never upsample, unless the image is smaller than the patch size.
"""
assert max_num_chunks > 0
assert isinstance(image, Image.Image), type(image)
w, h = image.size
possible_resolutions = self.find_supported_resolutions(max_num_chunks=max_num_chunks, patch_size=self.size)
possible_resolutions = torch.tensor(possible_resolutions)
best_resolution = self.get_best_fit(
image_size=(w, h),
possible_resolutions=possible_resolutions,
resize_to_max_canvas=resize_to_max_canvas,
)
max_upscaling_size = None if resize_to_max_canvas else self.size
image = self.resize_without_distortion(image, best_resolution, max_upscaling_size)
image = self._pad(image, best_resolution)
image = self.to_tensor(image)
if normalize_img:
image = self.normalize(image)
ratio_w, ratio_h = (
best_resolution[0] // self.size,
best_resolution[1] // self.size,
)
image = self._split(image, ratio_w, ratio_h) # type: ignore
ar = (ratio_h, ratio_w)
return image, ar
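A hedged usage sketch for the transform above, mirroring the 300x800 example from the class docstring; the blank test image and chunk budget are illustrative only.
from PIL import Image
transform = VariableSizeImageTransform(size=224)
img = Image.new("RGB", (300, 800))  # width x height
chunks, aspect_ratio = transform(img, max_num_chunks=8, resize_to_max_canvas=True)
# chunks is a tensor of shape (n_chunks, 3, 224, 224) -- (8, 3, 224, 224) for the
# docstring's 2x4 tiling -- and aspect_ratio records the number of tiles per axis.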

File diff suppressed because it is too large

View file

@ -0,0 +1,26 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# top-level folder for each specific model found within the models/ directory at
# the top-level of this source tree.
import collections
import torch
def get_negative_inf_value(dtype):
return torch.finfo(dtype).min
def to_2tuple(x):
if isinstance(x, collections.abc.Iterable):
return x
return (x, x)

View file

@ -9,18 +9,18 @@ from copy import deepcopy
from functools import partial
from typing import Any, Generator
from llama_models.llama3.api.chat_format import ChatFormat
from llama_models.llama3.api.tokenizer import Tokenizer
from llama_stack.models.llama.datatypes import Model
from llama_stack.models.llama.llama3.chat_format import ChatFormat
from llama_stack.models.llama.llama3.tokenizer import Tokenizer
from llama_stack.models.llama.sku_list import resolve_model
from llama_stack.providers.utils.inference.prompt_adapter import (
ChatCompletionRequestWithRawContent,
CompletionRequestWithRawContent,
)
from .common import model_checkpoint_dir
from .config import MetaReferenceInferenceConfig
from .generation import Llama, model_checkpoint_dir
from .llama3.generation import Llama3
from .parallel_utils import ModelParallelProcessGroup
@ -43,7 +43,7 @@ def init_model_cb(
model_id: str,
llama_model: Model,
):
llama = Llama.build(config, model_id, llama_model)
llama = Llama3.build(config, model_id, llama_model)
return ModelRunner(llama)

View file

@ -10,6 +10,7 @@
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import copy
import json
import logging
import multiprocessing
@ -36,7 +37,7 @@ from llama_stack.providers.utils.inference.prompt_adapter import (
CompletionRequestWithRawContent,
)
from .generation import TokenResult
from .common import TokenResult
log = logging.getLogger(__name__)
@ -207,13 +208,13 @@ def maybe_parse_message(maybe_json: Optional[str]) -> Optional[ProcessingMessage
return parse_message(maybe_json)
except json.JSONDecodeError:
return None
except ValueError as e:
except ValueError:
return None
def parse_message(json_str: str) -> ProcessingMessage:
data = json.loads(json_str)
return ProcessingMessageWrapper(**data).payload
return copy.deepcopy(ProcessingMessageWrapper(**data).payload)
def worker_process_entrypoint(
@ -231,7 +232,7 @@ def worker_process_entrypoint(
while True:
try:
task = req_gen.send(result)
if isinstance(task, str) and task == _END_SENTINEL:
if isinstance(task, str) and task == EndSentinel():
break
assert isinstance(task, TaskRequest)
@ -352,7 +353,7 @@ class ModelParallelProcessGroup:
if isinstance(obj, TaskResponse):
yield obj.result
except GeneratorExit as e:
except GeneratorExit:
self.request_socket.send(encode_msg(CancelSentinel()))
while True:
obj_json = self.request_socket.send()

View file

@ -7,6 +7,9 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# This software may be used and distributed in accordance with the terms of the Llama 3 Community License Agreement.
# The file gets a special treatment for now?
# ruff: noqa: N803
import unittest
import torch

View file

@ -12,10 +12,9 @@ import os
from typing import Any, Dict, List, Optional
import torch
from fairscale.nn.model_parallel.initialize import get_model_parallel_rank
from fairscale.nn.model_parallel.layers import ColumnParallelLinear, RowParallelLinear
from fairscale.nn.model_parallel.mappings import reduce_from_model_parallel_region
from llama_models.llama3.api.args import ModelArgs
from llama_models.llama3.reference_impl.model import Transformer, TransformerBlock
from torch import Tensor, nn
from torchao.quantization.GPTQ import Int8DynActInt4WeightLinear
@ -23,6 +22,8 @@ from llama_stack.apis.inference import QuantizationType
from llama_stack.models.llama.datatypes import CheckpointQuantizationFormat
from llama_stack.models.llama.sku_list import resolve_model
from ...llama3.args import ModelArgs
from ...llama3.model import Transformer, TransformerBlock
from ..config import MetaReferenceQuantizedInferenceConfig
log = logging.getLogger(__name__)

View file

@ -22,11 +22,11 @@ from fairscale.nn.model_parallel.initialize import (
initialize_model_parallel,
model_parallel_is_initialized,
)
from llama_models.llama3.api.args import ModelArgs
from llama_models.llama3.api.tokenizer import Tokenizer
from llama_models.llama3.reference_impl.model import Transformer, TransformerBlock
from torch.nn.parameter import Parameter
from llama_stack.models.llama.llama3.tokenizer import Tokenizer
from llama_stack.providers.inline.inference.meta_reference.llama3.args import ModelArgs
from llama_stack.providers.inline.inference.meta_reference.llama3.model import Transformer, TransformerBlock
from llama_stack.providers.inline.inference.meta_reference.quantization.fp8_impls import (
quantize_fp8,
)

View file

@ -21,7 +21,7 @@ NPROC=$7
echo $MASTER_HOST, $RUN_ID, $CKPT_DIR, $QUANT_CKPT_DIR
NCCL_NET=Socket NCCL_SOCKET_IFNAME=eth TIKTOKEN_CACHE_DIR="" PYTHONPATH="/home/$USER/llama-models:/home/$USER/llama-stack" \
NCCL_NET=Socket NCCL_SOCKET_IFNAME=eth TIKTOKEN_CACHE_DIR="" PYTHONPATH="/home/$USER/llama-stack" \
torchrun \
--nnodes=$NNODES --nproc_per_node=$NPROC \
--rdzv_id=$RUN_ID \

View file

@ -4,6 +4,8 @@
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from typing import Any, Dict
from llama_stack.providers.inline.inference.sentence_transformers.config import (
SentenceTransformersInferenceConfig,
)
@ -11,7 +13,7 @@ from llama_stack.providers.inline.inference.sentence_transformers.config import
async def get_provider_impl(
config: SentenceTransformersInferenceConfig,
_deps,
_deps: Dict[str, Any],
):
from .sentence_transformers import SentenceTransformersInferenceImpl

View file

@ -11,5 +11,5 @@ from pydantic import BaseModel
class SentenceTransformersInferenceConfig(BaseModel):
@classmethod
def sample_run_config(cls) -> Dict[str, Any]:
def sample_run_config(cls, **kwargs) -> Dict[str, Any]:
return {}

View file

@ -53,7 +53,7 @@ class SentenceTransformersInferenceImpl(
self,
model_id: str,
content: str,
sampling_params: Optional[SamplingParams] = SamplingParams(),
sampling_params: Optional[SamplingParams] = None,
response_format: Optional[ResponseFormat] = None,
stream: Optional[bool] = False,
logprobs: Optional[LogProbConfig] = None,
@ -64,7 +64,7 @@ class SentenceTransformersInferenceImpl(
self,
model_id: str,
messages: List[Message],
sampling_params: Optional[SamplingParams] = SamplingParams(),
sampling_params: Optional[SamplingParams] = None,
response_format: Optional[ResponseFormat] = None,
tools: Optional[List[ToolDefinition]] = None,
tool_choice: Optional[ToolChoice] = ToolChoice.auto,

View file

@ -4,12 +4,12 @@
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from typing import Any
from typing import Any, Dict
from .config import VLLMConfig
async def get_provider_impl(config: VLLMConfig, _deps) -> Any:
async def get_provider_impl(config: VLLMConfig, _deps: Dict[str, Any]):
from .vllm import VLLMInferenceImpl
impl = VLLMInferenceImpl(config)

View file

@ -4,20 +4,21 @@
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from pydantic import BaseModel, Field, field_validator
from typing import Any, Dict
from pydantic import BaseModel, Field
from llama_stack.providers.utils.inference import supported_inference_models
from llama_stack.schema_utils import json_schema_type
@json_schema_type
class VLLMConfig(BaseModel):
"""Configuration for the vLLM inference provider."""
"""Configuration for the vLLM inference provider.
Note that the model name is no longer part of this static configuration.
You can bind an instance of this provider to a specific model with the
``models.register()`` API call."""
model: str = Field(
default="Llama3.2-3B-Instruct",
description="Model descriptor from `llama model list`",
)
tensor_parallel_size: int = Field(
default=1,
description="Number of tensor parallel replicas (number of GPUs to use).",
@ -26,32 +27,27 @@ class VLLMConfig(BaseModel):
default=4096,
description="Maximum number of tokens to generate.",
)
max_model_len: int = Field(default=4096, description="Maximum context length to use during serving.")
max_num_seqs: int = Field(default=4, description="Maximum parallel batch size for generation.")
enforce_eager: bool = Field(
default=False,
description="Whether to use eager mode for inference (otherwise cuda graphs are used).",
)
gpu_memory_utilization: float = Field(
default=0.3,
description=(
"How much GPU memory will be allocated when this provider has finished "
"loading, including memory that was already allocated before loading."
),
)
@classmethod
def sample_run_config(cls):
def sample_run_config(cls, **kwargs: Any) -> Dict[str, Any]:
return {
"model": "${env.INFERENCE_MODEL:Llama3.2-3B-Instruct}",
"tensor_parallel_size": "${env.TENSOR_PARALLEL_SIZE:1}",
"max_tokens": "${env.MAX_TOKENS:4096}",
"max_model_len": "${env.MAX_MODEL_LEN:4096}",
"max_num_seqs": "${env.MAX_NUM_SEQS:4}",
"enforce_eager": "${env.ENFORCE_EAGER:False}",
"gpu_memory_utilization": "${env.GPU_MEMORY_UTILIZATION:0.7}",
"gpu_memory_utilization": "${env.GPU_MEMORY_UTILIZATION:0.3}",
}
@field_validator("model")
@classmethod
def validate_model(cls, model: str) -> str:
permitted_models = supported_inference_models()
descriptors = [m.descriptor() for m in permitted_models]
repos = [m.huggingface_repo for m in permitted_models]
if model not in (descriptors + repos):
model_list = "\n\t".join(repos)
raise ValueError(f"Unknown model: `{model}`. Choose from [\n\t{model_list}\n]")
return model
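As the updated docstring notes, the model is no longer part of this static configuration; below is a hedged sketch of binding a model to the provider at runtime, assuming a llama-stack-client installation — the base URL, model id, and provider id are illustrative.
from llama_stack_client import LlamaStackClient

client = LlamaStackClient(base_url="http://localhost:8321")
# Bind a model to the running inline vLLM provider; identifiers are hypothetical.
client.models.register(
    model_id="meta-llama/Llama-3.2-3B-Instruct",
    provider_id="vllm",
)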

View file

@ -0,0 +1,170 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from typing import List, Optional
import vllm
from llama_stack.apis.inference import (
ChatCompletionRequest,
GrammarResponseFormat,
JsonSchemaResponseFormat,
Message,
ToolChoice,
UserMessage,
)
from llama_stack.models.llama.datatypes import BuiltinTool, ToolDefinition
from llama_stack.providers.utils.inference.openai_compat import (
convert_message_to_openai_dict,
get_sampling_options,
)
###############################################################################
# This file contains OpenAI compatibility code that is currently only used
# by the inline vLLM connector. Some or all of this code may be moved to a
# central location at a later date.
def _merge_context_into_content(message: Message) -> Message: # type: ignore
"""
Merge the ``context`` field of a Llama Stack ``Message`` object into
the content field for compatibility with OpenAI-style APIs.
Generates a content string that emulates the current behavior
of ``llama_models.llama3.api.chat_format.encode_message()``.
:param message: Message that may include ``context`` field
:returns: A version of ``message`` with any context merged into the
``content`` field.
"""
if not isinstance(message, UserMessage): # Separate type check for linter
return message
if message.context is None:
return message
return UserMessage(
role=message.role,
# Emulate llama_models.llama3.api.chat_format.encode_message()
content=message.content + "\n\n" + message.context,
context=None,
)
def _llama_stack_tools_to_openai_tools(
tools: Optional[List[ToolDefinition]] = None,
) -> List[vllm.entrypoints.openai.protocol.ChatCompletionToolsParam]:
"""
Convert the list of available tools from Llama Stack's format to vLLM's
version of OpenAI's format.
"""
if tools is None:
return []
result = []
for t in tools:
if isinstance(t.tool_name, BuiltinTool):
raise NotImplementedError("Built-in tools not yet implemented")
if t.parameters is None:
parameters = None
else: # if t.parameters is not None
# Convert the "required" flags to a list of required params
required_params = [k for k, v in t.parameters.items() if v.required]
parameters = {
"type": "object", # Mystery value that shows up in OpenAI docs
"properties": {
k: {"type": v.param_type, "description": v.description} for k, v in t.parameters.items()
},
"required": required_params,
}
function_def = vllm.entrypoints.openai.protocol.FunctionDefinition(
name=t.tool_name, description=t.description, parameters=parameters
)
# Every tool definition is double-boxed in a ChatCompletionToolsParam
result.append(vllm.entrypoints.openai.protocol.ChatCompletionToolsParam(function=function_def))
return result
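To make the mapping concrete: for a hypothetical tool with one required string parameter, the conversion above produces roughly the following OpenAI-style function definition (the tool name and parameter are invented for illustration).
expected_function_def = {
    "name": "get_weather",
    "description": "Look up the current weather for a city",
    "parameters": {
        "type": "object",
        "properties": {
            "city": {"type": "string", "description": "Name of the city"},
        },
        "required": ["city"],
    },
}
# Each such definition is then wrapped in a ChatCompletionToolsParam, as noted above.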
async def llama_stack_chat_completion_to_openai_chat_completion_dict(
request: ChatCompletionRequest,
) -> dict:
"""
Convert a chat completion request in Llama Stack format into an
equivalent set of arguments to pass to an OpenAI-compatible
chat completions API.
:param request: Bundled request parameters in Llama Stack format.
:returns: Dictionary of key-value pairs to use as an initializer
for a dataclass or to be converted directly to JSON and sent
over the wire.
"""
converted_messages = [
# This mystery async call makes the parent function also be async
await convert_message_to_openai_dict(_merge_context_into_content(m), download=True)
for m in request.messages
]
converted_tools = _llama_stack_tools_to_openai_tools(request.tools)
# Llama will try to use built-in tools with no tool catalog, so don't enable
# tool choice unless at least one tool is enabled.
converted_tool_choice = "none"
if (
request.tool_config is not None
and request.tool_config.tool_choice == ToolChoice.auto
and request.tools is not None
and len(request.tools) > 0
):
converted_tool_choice = "auto"
# TODO: Figure out what to do with the tool_prompt_format argument.
# Other connectors appear to drop it quietly.
# Use Llama Stack shared code to translate sampling parameters.
sampling_options = get_sampling_options(request.sampling_params)
# get_sampling_options() translates repetition penalties to an option that
# OpenAI's APIs don't know about.
# vLLM's OpenAI-compatible API also handles repetition penalties wrong.
# For now, translate repetition penalties into a format that vLLM's broken
# API will handle correctly. Two wrongs make a right...
if "repeat_penalty" in sampling_options:
del sampling_options["repeat_penalty"]
if request.sampling_params.repetition_penalty is not None and request.sampling_params.repetition_penalty != 1.0:
sampling_options["repetition_penalty"] = request.sampling_params.repetition_penalty
# Convert a single response format into four different parameters, per
# the OpenAI spec
guided_decoding_options = dict()
if request.response_format is None:
# Use defaults
pass
elif isinstance(request.response_format, JsonSchemaResponseFormat):
guided_decoding_options["guided_json"] = request.response_format.json_schema
elif isinstance(request.response_format, GrammarResponseFormat):
guided_decoding_options["guided_grammar"] = request.response_format.bnf
else:
raise TypeError(f"ResponseFormat object is of unexpected subtype '{type(request.response_format)}'")
logprob_options = dict()
if request.logprobs is not None:
logprob_options["logprobs"] = request.logprobs.top_k
# Marshall together all the arguments for a ChatCompletionRequest
request_options = {
"model": request.model,
"messages": converted_messages,
"tools": converted_tools,
"tool_choice": converted_tool_choice,
"stream": request.stream,
**sampling_options,
**guided_decoding_options,
**logprob_options,
}
return request_options

View file

@ -4,45 +4,71 @@
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import logging
import os
import json
import re
import uuid
from typing import AsyncGenerator, List, Optional
from typing import AsyncGenerator, AsyncIterator, Dict, List, Optional, Union
from llama_models.llama3.api.tokenizer import Tokenizer
# These vLLM modules contain names that overlap with Llama Stack names, so we import
# fully-qualified names
import vllm.entrypoints.openai.protocol
import vllm.sampling_params
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.async_llm_engine import AsyncLLMEngine
from vllm.sampling_params import SamplingParams as VLLMSamplingParams
from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
from vllm.entrypoints.openai.serving_models import BaseModelPath, OpenAIServingModels
from llama_stack.apis.common.content_types import InterleavedContent
from llama_stack.apis.common.content_types import (
InterleavedContent,
InterleavedContentItem,
TextDelta,
ToolCallDelta,
)
from llama_stack.apis.inference import (
ChatCompletionRequest,
ChatCompletionResponse,
ChatCompletionResponseEvent,
ChatCompletionResponseEventType,
ChatCompletionResponseStreamChunk,
CompletionMessage,
CompletionResponse,
CompletionResponseStreamChunk,
EmbeddingsResponse,
EmbeddingTaskType,
GrammarResponseFormat,
Inference,
InterleavedContentItem,
JsonSchemaResponseFormat,
LogProbConfig,
Message,
ResponseFormat,
SamplingParams,
TextTruncation,
TokenLogProbs,
ToolChoice,
ToolConfig,
ToolDefinition,
ToolPromptFormat,
)
from llama_stack.apis.models import Model
from llama_stack.models.llama.sku_list import resolve_model
from llama_stack.providers.datatypes import ModelsProtocolPrivate
from llama_stack.log import get_logger
from llama_stack.models.llama import sku_list
from llama_stack.models.llama.datatypes import (
StopReason,
ToolCall,
ToolDefinition,
ToolPromptFormat,
TopKSamplingStrategy,
TopPSamplingStrategy,
)
from llama_stack.models.llama.llama3.chat_format import ChatFormat
from llama_stack.models.llama.llama3.tokenizer import Tokenizer
from llama_stack.providers.remote.inference.vllm.vllm import build_hf_repo_model_entries
from llama_stack.providers.utils.inference.model_registry import (
ModelRegistryHelper,
ModelsProtocolPrivate,
)
from llama_stack.providers.utils.inference.openai_compat import (
OpenAICompatCompletionChoice,
OpenAICompatCompletionResponse,
get_sampling_options,
process_chat_completion_response,
get_stop_reason,
process_chat_completion_stream_response,
)
from llama_stack.providers.utils.inference.prompt_adapter import (
@ -50,188 +76,322 @@ from llama_stack.providers.utils.inference.prompt_adapter import (
)
from .config import VLLMConfig
from .openai_utils import llama_stack_chat_completion_to_openai_chat_completion_dict
log = logging.getLogger(__name__)
# Map from Hugging Face model architecture name to appropriate tool parser.
# See vllm.entrypoints.openai.tool_parsers.ToolParserManager.tool_parsers for the full list of
# available parsers.
# TODO: Expand this list
CONFIG_TYPE_TO_TOOL_PARSER = {
"GraniteConfig": "granite",
"MllamaConfig": "llama3_json",
"LlamaConfig": "llama3_json",
}
DEFAULT_TOOL_PARSER = "pythonic"
def _random_uuid() -> str:
logger = get_logger(__name__, category="inference")
def _random_uuid_str() -> str:
return str(uuid.uuid4().hex)
def _response_format_to_guided_decoding_params(
response_format: Optional[ResponseFormat], # type: ignore
) -> vllm.sampling_params.GuidedDecodingParams:
"""
Translate constrained decoding parameters from Llama Stack's format to vLLM's format.
:param response_format: Llama Stack version of constrained decoding info. Can be ``None``,
indicating no constraints.
:returns: The equivalent dataclass object for the low-level inference layer of vLLM.
"""
if response_format is None:
# As of vLLM 0.6.3, the default constructor for GuidedDecodingParams() returns an invalid
# value that crashes the executor on some code paths. Use ``None`` instead.
return None
# Llama Stack currently implements fewer types of constrained decoding than vLLM does.
# Translate the types that exist and detect if Llama Stack adds new ones.
if isinstance(response_format, JsonSchemaResponseFormat):
return vllm.sampling_params.GuidedDecodingParams(json=response_format.json_schema)
elif isinstance(response_format, GrammarResponseFormat):
# BNF grammar.
# Llama Stack uses the parse tree of the grammar, while vLLM uses the string
# representation of the grammar.
raise TypeError(
"Constrained decoding with BNF grammars is not currently implemented, because the "
"reference implementation does not implement it."
)
else:
raise TypeError(f"ResponseFormat object is of unexpected subtype '{type(response_format)}'")
def _convert_sampling_params(
sampling_params: Optional[SamplingParams],
response_format: Optional[ResponseFormat], # type: ignore
log_prob_config: Optional[LogProbConfig],
) -> vllm.SamplingParams:
"""Convert sampling and constrained decoding configuration from Llama Stack's format to vLLM's
format."""
# In the absence of provided config values, use Llama Stack defaults as encoded in the Llama
# Stack dataclasses. These defaults are different from vLLM's defaults.
if sampling_params is None:
sampling_params = SamplingParams()
if log_prob_config is None:
log_prob_config = LogProbConfig()
if isinstance(sampling_params.strategy, TopKSamplingStrategy):
if sampling_params.strategy.top_k == 0:
# vLLM treats "k" differently for top-k sampling
vllm_top_k = -1
else:
vllm_top_k = sampling_params.strategy.top_k
else:
vllm_top_k = -1
if isinstance(sampling_params.strategy, TopPSamplingStrategy):
vllm_top_p = sampling_params.strategy.top_p
# Llama Stack only allows temperature with top-P.
vllm_temperature = sampling_params.strategy.temperature
else:
vllm_top_p = 1.0
vllm_temperature = 0.0
# vLLM allows top-p and top-k at the same time.
vllm_sampling_params = vllm.SamplingParams.from_optional(
max_tokens=(None if sampling_params.max_tokens == 0 else sampling_params.max_tokens),
temperature=vllm_temperature,
top_p=vllm_top_p,
top_k=vllm_top_k,
repetition_penalty=sampling_params.repetition_penalty,
guided_decoding=_response_format_to_guided_decoding_params(response_format),
logprobs=log_prob_config.top_k,
)
return vllm_sampling_params
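A brief sketch of the conversion above for a typical top-p request; SamplingParams and TopPSamplingStrategy come from this module's imports, and the concrete values are illustrative.
example_params = SamplingParams(
    strategy=TopPSamplingStrategy(temperature=0.7, top_p=0.9),
    max_tokens=256,
    repetition_penalty=1.1,
)
vllm_params = _convert_sampling_params(example_params, response_format=None, log_prob_config=None)
# Expect: temperature=0.7, top_p=0.9, top_k=-1 (disabled), max_tokens=256 on the vLLM side.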
class VLLMInferenceImpl(Inference, ModelsProtocolPrivate):
"""Inference implementation for vLLM."""
"""
vLLM-based inference model adapter for Llama Stack with support for multiple models.
Requires the configuration parameters documented in the :class:`VLLMConfig` class.
"""
config: VLLMConfig
register_helper: ModelRegistryHelper
model_ids: set[str]
resolved_model_id: str | None
engine: AsyncLLMEngine | None
chat: OpenAIServingChat | None
is_meta_llama_model: bool
def __init__(self, config: VLLMConfig):
self.config = config
logger.info(f"Config is: {self.config}")
self.register_helper = ModelRegistryHelper(build_hf_repo_model_entries())
self.formatter = ChatFormat(Tokenizer.get_instance())
# The following are initialized when paths are bound to this provider
self.resolved_model_id = None
self.model_ids = set()
self.engine = None
self.chat = None
self.is_meta_llama_model = False
async def initialize(self):
log.info("Initializing vLLM inference provider.")
###########################################################################
# METHODS INHERITED FROM IMPLICIT BASE CLASS.
# TODO: Make this class inherit from the new base class ProviderBase once that class exists.
# Disable usage stats reporting. This would be a surprising thing for most
# people to find out was on by default.
# https://docs.vllm.ai/en/latest/serving/usage_stats.html
if "VLLM_NO_USAGE_STATS" not in os.environ:
os.environ["VLLM_NO_USAGE_STATS"] = "1"
async def initialize(self) -> None:
"""
Callback that is invoked through many levels of indirection during provider class
instantiation, sometime after __init__() is called and before any model registration
methods or methods connected to a REST API are called.
model = resolve_model(self.config.model)
if model is None:
raise ValueError(f"Unknown model {self.config.model}")
It's not clear what assumptions the class can make about the platform's initialization
state here that can't be made during __init__(), and vLLM can't be started until we know
what model it's supposed to be serving, so nothing happens here currently.
"""
pass
if model.huggingface_repo is None:
raise ValueError(f"Model {self.config.model} needs a huggingface repo")
# TODO -- there are a ton of options supported here ...
engine_args = AsyncEngineArgs(
model=model.huggingface_repo,
tokenizer=model.huggingface_repo,
tensor_parallel_size=self.config.tensor_parallel_size,
enforce_eager=self.config.enforce_eager,
gpu_memory_utilization=self.config.gpu_memory_utilization,
guided_decoding_backend="lm-format-enforcer",
)
self.engine = AsyncLLMEngine.from_engine_args(engine_args)
async def shutdown(self):
"""Shut down the vLLM inference adapter."""
log.info("Shutting down vLLM inference provider.")
if self.engine:
async def shutdown(self) -> None:
logger.info(f"Shutting down inline vLLM inference provider {self}.")
if self.engine is not None:
self.engine.shutdown_background_loop()
self.engine = None
self.chat = None
self.model_ids = set()
self.resolved_model_id = None
###########################################################################
# METHODS INHERITED FROM ModelsProtocolPrivate INTERFACE
# Note that the return type of the superclass method is WRONG
async def register_model(self, model: Model) -> Model:
"""
Callback that is called when the server associates an inference endpoint
with an inference provider.
Callback that is called when the server associates an inference endpoint with an
inference provider.
:param model: Object that encapsulates parameters necessary for identifying
a specific LLM.
:param model: Object that encapsulates parameters necessary for identifying a specific
LLM.
:returns: The input ``Model`` object. It may or may not be permissible
to change fields before returning this object.
:returns: The input ``Model`` object. It may or may not be permissible to change fields
before returning this object.
"""
log.info(f"Registering model {model.identifier} with vLLM inference provider.")
# The current version of this provider is hard-coded to serve only
# the model specified in the YAML config file.
configured_model = resolve_model(self.config.model)
registered_model = resolve_model(model.model_id)
logger.debug(f"In register_model({model})")
# First attempt to interpret the model coordinates as a Llama model name
resolved_llama_model = sku_list.resolve_model(model.provider_model_id)
if resolved_llama_model is not None:
# Load from Hugging Face repo into default local cache dir
model_id_for_vllm = resolved_llama_model.huggingface_repo
# Detect a genuine Meta Llama model to trigger Meta-specific preprocessing.
# Don't set self.is_meta_llama_model until we actually load the model.
is_meta_llama_model = True
else: # if resolved_llama_model is None
# Not a Llama model name. Pass the model id through to vLLM's loader
model_id_for_vllm = model.provider_model_id
is_meta_llama_model = False
if self.resolved_model_id is not None:
if model_id_for_vllm != self.resolved_model_id:
raise ValueError(
f"Attempted to serve two LLMs (ids '{self.resolved_model_id}') and "
f"'{model_id_for_vllm}') from one copy of provider '{self}'. Use multiple "
f"copies of the provider instead."
)
else:
# Model already loaded
logger.info(
f"Requested id {model} resolves to {model_id_for_vllm}, which is already loaded. Continuing."
)
self.model_ids.add(model.model_id)
return model
logger.info(f"Requested id {model} resolves to {model_id_for_vllm}. Loading {model_id_for_vllm}.")
if is_meta_llama_model:
logger.info(f"Model {model_id_for_vllm} is a Meta Llama model.")
self.is_meta_llama_model = is_meta_llama_model
# If we get here, this is the first time registering a model.
# Preload so that the first inference request won't time out.
engine_args = AsyncEngineArgs(
model=model_id_for_vllm,
tokenizer=model_id_for_vllm,
tensor_parallel_size=self.config.tensor_parallel_size,
enforce_eager=self.config.enforce_eager,
gpu_memory_utilization=self.config.gpu_memory_utilization,
max_num_seqs=self.config.max_num_seqs,
max_model_len=self.config.max_model_len,
)
self.engine = AsyncLLMEngine.from_engine_args(engine_args)
# vLLM currently requires the user to specify the tool parser manually. To choose a tool
# parser, we need to determine what model architecture is being used. For now, we infer
# that information from what config class the model uses.
low_level_model_config = self.engine.engine.get_model_config()
hf_config = low_level_model_config.hf_config
hf_config_class_name = hf_config.__class__.__name__
if hf_config_class_name in CONFIG_TYPE_TO_TOOL_PARSER:
tool_parser = CONFIG_TYPE_TO_TOOL_PARSER[hf_config_class_name]
else:
# No info -- choose a default so we can at least attempt tool
# use.
tool_parser = DEFAULT_TOOL_PARSER
logger.debug(f"{hf_config_class_name=}")
logger.debug(f"{tool_parser=}")
# Wrap the lower-level engine in an OpenAI-compatible chat API
model_config = await self.engine.get_model_config()
self.chat = OpenAIServingChat(
engine_client=self.engine,
model_config=model_config,
models=OpenAIServingModels(
engine_client=self.engine,
model_config=model_config,
base_model_paths=[
# The layer below us will only see resolved model IDs
BaseModelPath(model_id_for_vllm, model_id_for_vllm)
],
),
response_role="assistant",
request_logger=None, # Use default logging
chat_template=None, # Use default template from model checkpoint
enable_auto_tools=True,
tool_parser=tool_parser,
chat_template_content_format="auto",
)
self.resolved_model_id = model_id_for_vllm
self.model_ids.add(model.model_id)
logger.info(f"Finished preloading model: {model_id_for_vllm}")
if configured_model.core_model_id != registered_model.core_model_id:
raise ValueError(
f"Requested model '{model.identifier}' is different from "
f"model '{self.config.model}' that this provider "
f"is configured to serve"
)
return model
def _sampling_params(self, sampling_params: SamplingParams) -> VLLMSamplingParams:
if sampling_params is None:
return VLLMSamplingParams(max_tokens=self.config.max_tokens)
options = get_sampling_options(sampling_params)
if "repeat_penalty" in options:
options["repetition_penalty"] = options["repeat_penalty"]
del options["repeat_penalty"]
return VLLMSamplingParams(**options)
async def unregister_model(self, model_id: str) -> None:
pass
"""
Callback that is called when the server removes an inference endpoint from an inference
provider.
:param model_id: The same external ID that the higher layers of the stack previously passed
to :func:`register_model()`
"""
if model_id not in self.model_ids:
raise ValueError(
f"Attempted to unregister model ID '{model_id}', but that ID is not registered to this provider."
)
self.model_ids.remove(model_id)
if len(self.model_ids) == 0:
# Last model was just unregistered. Shut down the connection to vLLM and free up
# resources.
# Note that this operation may cause in-flight chat completion requests on the
# now-unregistered model to return errors.
self.resolved_model_id = None
self.chat = None
self.engine.shutdown_background_loop()
self.engine = None
###########################################################################
# METHODS INHERITED FROM Inference INTERFACE
async def completion(
self,
model_id: str,
content: InterleavedContent,
sampling_params: Optional[SamplingParams] = SamplingParams(),
sampling_params: Optional[SamplingParams] = None,
response_format: Optional[ResponseFormat] = None,
stream: Optional[bool] = False,
logprobs: Optional[LogProbConfig] = None,
) -> CompletionResponse | CompletionResponseStreamChunk:
raise NotImplementedError("Completion not implemented for vLLM")
) -> Union[CompletionResponse, AsyncIterator[CompletionResponseStreamChunk]]:
if model_id not in self.model_ids:
raise ValueError(
f"This adapter is not registered to model id '{model_id}'. Registered IDs are: {self.model_ids}"
)
if not isinstance(content, str):
raise NotImplementedError("Multimodal input not currently supported")
if sampling_params is None:
sampling_params = SamplingParams()
async def chat_completion(
self,
model_id: str,
messages: List[Message],
sampling_params: Optional[SamplingParams] = SamplingParams(),
tools: Optional[List[ToolDefinition]] = None,
tool_choice: Optional[ToolChoice] = ToolChoice.auto,
tool_prompt_format: Optional[ToolPromptFormat] = None,
response_format: Optional[ResponseFormat] = None,
stream: Optional[bool] = False,
logprobs: Optional[LogProbConfig] = None,
tool_config: Optional[ToolConfig] = None,
) -> ChatCompletionResponse | ChatCompletionResponseStreamChunk:
assert self.engine is not None
converted_sampling_params = _convert_sampling_params(sampling_params, response_format, logprobs)
request = ChatCompletionRequest(
model=model_id,
messages=messages,
sampling_params=sampling_params,
tools=tools or [],
stream=stream,
logprobs=logprobs,
tool_config=tool_config,
)
logger.debug(f"{converted_sampling_params=}")
log.info("Sampling params: %s", sampling_params)
request_id = _random_uuid()
prompt = await chat_completion_request_to_prompt(request, self.config.model)
vllm_sampling_params = self._sampling_params(request.sampling_params)
results_generator = self.engine.generate(prompt, vllm_sampling_params, request_id)
if stream:
return self._stream_chat_completion(request, results_generator)
return self._streaming_completion(content, converted_sampling_params)
else:
return await self._nonstream_chat_completion(request, results_generator)
async def _nonstream_chat_completion(
self, request: ChatCompletionRequest, results_generator: AsyncGenerator
) -> ChatCompletionResponse:
outputs = [o async for o in results_generator]
final_output = outputs[-1]
assert final_output is not None
outputs = final_output.outputs
finish_reason = outputs[-1].stop_reason
choice = OpenAICompatCompletionChoice(
finish_reason=finish_reason,
text="".join([output.text for output in outputs]),
)
response = OpenAICompatCompletionResponse(
choices=[choice],
)
return process_chat_completion_response(response, request)
async def _stream_chat_completion(
self, request: ChatCompletionRequest, results_generator: AsyncGenerator
) -> AsyncGenerator:
tokenizer = Tokenizer.get_instance()
async def _generate_and_convert_to_openai_compat():
cur = []
async for chunk in results_generator:
if not chunk.outputs:
log.warning("Empty chunk received")
continue
output = chunk.outputs[-1]
new_tokens = output.token_ids[len(cur) :]
text = tokenizer.decode(new_tokens)
cur.extend(new_tokens)
choice = OpenAICompatCompletionChoice(
finish_reason=output.finish_reason,
text=text,
)
yield OpenAICompatCompletionResponse(
choices=[choice],
)
stream = _generate_and_convert_to_openai_compat()
async for chunk in process_chat_completion_stream_response(stream, request):
yield chunk
streaming_result = None
async for streaming_result in self._streaming_completion(content, converted_sampling_params):
pass
return CompletionResponse(
content=streaming_result.delta,
stop_reason=streaming_result.stop_reason,
logprobs=streaming_result.logprobs,
)
async def embeddings(
self,
@ -242,3 +402,392 @@ class VLLMInferenceImpl(Inference, ModelsProtocolPrivate):
task_type: Optional[EmbeddingTaskType] = None,
) -> EmbeddingsResponse:
raise NotImplementedError()
async def chat_completion(
self,
model_id: str,
messages: List[Message], # type: ignore
sampling_params: Optional[SamplingParams] = None,
response_format: Optional[ResponseFormat] = None, # type: ignore
tools: Optional[List[ToolDefinition]] = None,
tool_choice: Optional[ToolChoice] = ToolChoice.auto,
tool_prompt_format: Optional[ToolPromptFormat] = None,
stream: Optional[bool] = False,
logprobs: Optional[LogProbConfig] = None,
tool_config: Optional[ToolConfig] = None,
) -> ChatCompletionResponse | ChatCompletionResponseStreamChunk:
sampling_params = sampling_params or SamplingParams()
if model_id not in self.model_ids:
raise ValueError(
f"This adapter is not registered to model id '{model_id}'. Registered IDs are: {self.model_ids}"
)
# Convert to Llama Stack internal format for consistency
request = ChatCompletionRequest(
model=self.resolved_model_id,
messages=messages,
sampling_params=sampling_params,
response_format=response_format,
tools=tools,
tool_choice=tool_choice,
tool_prompt_format=tool_prompt_format,
stream=stream,
logprobs=logprobs,
)
if self.is_meta_llama_model:
# Bypass vLLM chat templating layer for Meta Llama models, because the
# templating layer in Llama Stack currently produces better results.
logger.debug(
f"Routing {self.resolved_model_id} chat completion through "
f"Llama Stack's templating layer instead of vLLM's."
)
return await self._chat_completion_for_meta_llama(request)
logger.debug(f"{self.resolved_model_id} is not a Meta Llama model")
# Arguments to the vLLM call must be packaged as a ChatCompletionRequest dataclass.
# Note that this dataclass has the same name as a similar dataclass in Llama Stack.
request_options = await llama_stack_chat_completion_to_openai_chat_completion_dict(request)
chat_completion_request = vllm.entrypoints.openai.protocol.ChatCompletionRequest(**request_options)
logger.debug(f"Converted request: {chat_completion_request}")
vllm_result = await self.chat.create_chat_completion(chat_completion_request)
logger.debug(f"Result from vLLM: {vllm_result}")
if isinstance(vllm_result, vllm.entrypoints.openai.protocol.ErrorResponse):
raise ValueError(f"Error from vLLM layer: {vllm_result}")
# Return type depends on "stream" argument
if stream:
if not isinstance(vllm_result, AsyncGenerator):
raise TypeError(f"Unexpected result type {type(vllm_result)} for streaming inference call")
# vLLM client returns a stream of strings, which need to be parsed.
# Stream comes in the form of an async generator.
return self._convert_streaming_results(vllm_result)
else:
if not isinstance(vllm_result, vllm.entrypoints.openai.protocol.ChatCompletionResponse):
raise TypeError(f"Unexpected result type {type(vllm_result)} for non-streaming inference call")
return self._convert_non_streaming_results(vllm_result)
###########################################################################
# INTERNAL METHODS
async def _streaming_completion(
self, content: str, sampling_params: vllm.SamplingParams
) -> AsyncIterator[CompletionResponseStreamChunk]:
"""Internal implementation of :func:`completion()` API for the streaming case. Assumes
that arguments have been validated upstream.
:param content: Must be a string
:param sampling_params: Parameters from public API's ``response_format``
and ``sampling_params`` arguments, converted to VLLM format
"""
# We run against the vLLM generate() call directly instead of using the OpenAI-compatible
# layer, because doing so simplifies the code here.
# The vLLM engine requires a unique identifier for each call to generate()
request_id = _random_uuid_str()
# The vLLM generate() API is streaming-only and returns an async generator.
# The generator returns objects of type vllm.RequestOutput.
results_generator = self.engine.generate(content, sampling_params, request_id)
# Need to know the model's EOS token ID for the conversion code below.
# AsyncLLMEngine is a wrapper around LLMEngine, and the tokenizer is only available if
# we drill down to the LLMEngine inside the AsyncLLMEngine.
# Similarly, the tokenizer in an LLMEngine is a wrapper around a BaseTokenizerGroup,
# and we need to drill down to the Hugging Face tokenizer inside the BaseTokenizerGroup.
llm_engine = self.engine.engine
tokenizer_group = llm_engine.tokenizer
eos_token_id = tokenizer_group.tokenizer.eos_token_id
request_output: vllm.RequestOutput = None
async for request_output in results_generator:
# Check for weird inference failures
if request_output.outputs is None or len(request_output.outputs) == 0:
# This case also should never happen
raise ValueError("Inference produced empty result")
# If we get here, then request_output contains the final output of the generate() call.
# The result may include multiple alternate outputs, but Llama Stack APIs only allow
# us to return one.
output: vllm.CompletionOutput = request_output.outputs[0]
completion_string = output.text
# Convert logprobs from vLLM's format to Llama Stack's format
logprobs = [
TokenLogProbs(logprobs_by_token={v.decoded_token: v.logprob for _, v in logprob_dict.items()})
for logprob_dict in output.logprobs
]
# The final output chunk should be labeled with the reason that the overall generate()
# call completed.
logger.debug(f"{output.stop_reason=}; {type(output.stop_reason)=}")
if output.stop_reason is None:
stop_reason = None # Still going
elif output.stop_reason == "stop":
stop_reason = StopReason.end_of_turn
elif output.stop_reason == "length":
stop_reason = StopReason.out_of_tokens
elif isinstance(output.stop_reason, int):
# If the model config specifies multiple end-of-sequence tokens, then vLLM
# will return the token ID of the EOS token in the stop_reason field.
stop_reason = StopReason.end_of_turn
else:
raise ValueError(f"Unrecognized stop reason '{output.stop_reason}'")
# vLLM's protocol outputs the stop token, then sets end of message on the next step for
# some reason.
if request_output.outputs[-1].token_ids[-1] == eos_token_id:
stop_reason = StopReason.end_of_message
yield CompletionResponseStreamChunk(delta=completion_string, stop_reason=stop_reason, logprobs=logprobs)
# Llama Stack requires that the last chunk have a stop reason, but vLLM doesn't always
# provide one if it runs out of tokens.
if stop_reason is None:
yield CompletionResponseStreamChunk(
delta=completion_string,
stop_reason=StopReason.out_of_tokens,
logprobs=logprobs,
)
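
A minimal sketch of consuming this generator, consistent with the contract spelled out in the comments above (the "provider" instance and prompt are placeholders, not part of this change):

# Minimal consumption sketch; "provider" is an assumed, already-initialized
# instance of this adapter and is not defined in this diff.
import vllm

async def stream_demo(provider) -> None:
    params = vllm.SamplingParams(max_tokens=32)
    last_chunk = None
    async for chunk in provider._streaming_completion("Tell me a story.", params):
        print(chunk.delta, end="")
        last_chunk = chunk
    # Per the contract above, the final chunk always carries a stop_reason.
    assert last_chunk is not None and last_chunk.stop_reason is not None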
def _convert_non_streaming_results(
self, vllm_result: vllm.entrypoints.openai.protocol.ChatCompletionResponse
) -> ChatCompletionResponse:
"""
Subroutine to convert the non-streaming output of vLLM's OpenAI-compatible API into an
equivalent Llama Stack object.
The result from vLLM's non-streaming API is a dataclass with the same name as the Llama
Stack ChatCompletionResponse dataclass, but with more and different field names. We ignore
the fields that aren't currently present in the Llama Stack dataclass.
"""
# There may be multiple responses, but we can only pass through the first one.
if len(vllm_result.choices) == 0:
raise ValueError("Don't know how to convert response object without any responses")
vllm_message = vllm_result.choices[0].message
vllm_finish_reason = vllm_result.choices[0].finish_reason
converted_message = CompletionMessage(
role=vllm_message.role,
# Llama Stack API won't accept None for content field.
content=("" if vllm_message.content is None else vllm_message.content),
stop_reason=get_stop_reason(vllm_finish_reason),
tool_calls=[
ToolCall(
call_id=t.id,
tool_name=t.function.name,
# vLLM function args come back as a string. Llama Stack expects JSON.
arguments=json.loads(t.function.arguments),
arguments_json=t.function.arguments,
)
for t in vllm_message.tool_calls
],
)
# TODO: Convert logprobs
logger.debug(f"Converted message: {converted_message}")
return ChatCompletionResponse(
completion_message=converted_message,
)
async def _chat_completion_for_meta_llama(
self, request: ChatCompletionRequest
) -> Union[ChatCompletionResponse, AsyncIterator[ChatCompletionResponseStreamChunk]]:
"""
Subroutine that routes chat completions for Meta Llama models through Llama Stack's
chat template instead of using vLLM's version of that template. The Llama Stack version
of the chat template currently produces more reliable outputs.
Once vLLM's support for Meta Llama models has matured more, we should consider routing
Meta Llama requests through the vLLM chat completions API instead of using this method.
"""
formatter = ChatFormat(Tokenizer.get_instance())
# Note that this function call modifies `request` in place.
prompt = await chat_completion_request_to_prompt(request, self.resolved_model_id)
model_id = list(self.model_ids)[0] # Any model ID will do here
completion_response_or_iterator = await self.completion(
model_id=model_id,
content=prompt,
sampling_params=request.sampling_params,
response_format=request.response_format,
stream=request.stream,
logprobs=request.logprobs,
)
if request.stream:
if not isinstance(completion_response_or_iterator, AsyncIterator):
raise TypeError(
f"Received unexpected result type {type(completion_response_or_iterator)}for streaming request."
)
return self._chat_completion_for_meta_llama_streaming(completion_response_or_iterator, request)
# Non-streaming case: request.stream is False
if not isinstance(completion_response_or_iterator, CompletionResponse):
raise TypeError(
f"Received unexpected result type {type(completion_response_or_iterator)}for non-streaming request."
)
completion_response: CompletionResponse = completion_response_or_iterator
raw_message = formatter.decode_assistant_message_from_content(
completion_response.content, completion_response.stop_reason
)
return ChatCompletionResponse(
completion_message=CompletionMessage(
content=raw_message.content,
stop_reason=raw_message.stop_reason,
tool_calls=raw_message.tool_calls,
),
logprobs=completion_response.logprobs,
)
async def _chat_completion_for_meta_llama_streaming(
self, results_iterator: AsyncIterator, request: ChatCompletionRequest
) -> AsyncIterator:
"""
Code from :func:`_chat_completion_for_meta_llama()` that needs to be a separate
method to keep asyncio happy.
"""
# Convert to OpenAI format, then use shared code to convert to Llama Stack format.
async def _generate_and_convert_to_openai_compat():
chunk: CompletionResponseStreamChunk # Make Pylance happy
last_text_len = 0
async for chunk in results_iterator:
if chunk.stop_reason == StopReason.end_of_turn:
finish_reason = "stop"
elif chunk.stop_reason == StopReason.end_of_message:
finish_reason = "eos"
elif chunk.stop_reason == StopReason.out_of_tokens:
finish_reason = "length"
else:
finish_reason = None
# Convert delta back to an actual delta
text_delta = chunk.delta[last_text_len:]
last_text_len = len(chunk.delta)
logger.debug(f"{text_delta=}; {finish_reason=}")
yield OpenAICompatCompletionResponse(
choices=[OpenAICompatCompletionChoice(finish_reason=finish_reason, text=text_delta)]
)
stream = _generate_and_convert_to_openai_compat()
async for chunk in process_chat_completion_stream_response(stream, request):
logger.debug(f"Returning chunk: {chunk}")
yield chunk
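
A note on the delta handling above: the chunks produced by completion() effectively carry the cumulative text generated so far (see _streaming_completion, which yields output.text), so the loop slices off the previously seen prefix to recover a true delta. A standalone sketch of that conversion (the sample strings are made up):

# Standalone sketch of the cumulative-to-incremental conversion done above.
# The sample "cumulative" list is hypothetical, not output captured from vLLM.
def to_increments(cumulative):
    last_len = 0
    for text in cumulative:
        yield text[last_len:]
        last_len = len(text)

cumulative = ["Hel", "Hello", "Hello, wor", "Hello, world"]
assert list(to_increments(cumulative)) == ["Hel", "lo", ", wor", "ld"]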
async def _convert_streaming_results(self, vllm_result: AsyncIterator) -> AsyncIterator:
"""
Subroutine that wraps the streaming outputs of vLLM's OpenAI-compatible
API into a second async iterator that returns Llama Stack objects.
:param vllm_result: Stream of strings that need to be parsed
"""
# Tool calls come in pieces, but Llama Stack expects them in bigger chunks. We build up
# those chunks and output them at the end.
# This data structure holds the current set of partial tool calls.
index_to_tool_call: Dict[int, Dict] = dict()
# The Llama Stack event stream must always start with a start event. Use an empty one to
# simplify logic below
yield ChatCompletionResponseStreamChunk(
event=ChatCompletionResponseEvent(
event_type=ChatCompletionResponseEventType.start,
delta=TextDelta(text=""),
stop_reason=None,
)
)
converted_stop_reason = None
async for chunk_str in vllm_result:
# Due to OpenAI compatibility, each event in the stream will start with "data: " and
# end with "\n\n".
_prefix = "data: "
_suffix = "\n\n"
if not chunk_str.startswith(_prefix) or not chunk_str.endswith(_suffix):
raise ValueError(f"Can't parse result string from vLLM: '{re.escape(chunk_str)}'")
# In between the "data: " and newlines is an event record
data_str = chunk_str[len(_prefix) : -len(_suffix)]
# The end of the stream is indicated with "[DONE]"
if data_str == "[DONE]":
yield ChatCompletionResponseStreamChunk(
event=ChatCompletionResponseEvent(
event_type=ChatCompletionResponseEventType.complete,
delta=TextDelta(text=""),
stop_reason=converted_stop_reason,
)
)
return
# Anything that is not "[DONE]" should be a JSON record
parsed_chunk = json.loads(data_str)
logger.debug(f"Parsed JSON event to:\n{json.dumps(parsed_chunk, indent=2)}")
# The result may contain multiple completions, but Llama Stack APIs only support
# returning one.
first_choice = parsed_chunk["choices"][0]
converted_stop_reason = get_stop_reason(first_choice["finish_reason"])
delta_record = first_choice["delta"]
if "content" in delta_record:
# Text delta
yield ChatCompletionResponseStreamChunk(
event=ChatCompletionResponseEvent(
event_type=ChatCompletionResponseEventType.progress,
delta=TextDelta(text=delta_record["content"]),
stop_reason=converted_stop_reason,
)
)
elif "tool_calls" in delta_record:
# Tool call(s). Llama Stack APIs do not have a clear way to return partial tool
# calls, so buffer until we get a "tool calls" stop reason
for tc in delta_record["tool_calls"]:
index = tc["index"]
if index not in index_to_tool_call:
# First time this tool call is showing up
index_to_tool_call[index] = dict()
tool_call = index_to_tool_call[index]
if "id" in tc:
tool_call["call_id"] = tc["id"]
if "function" in tc:
if "name" in tc["function"]:
tool_call["tool_name"] = tc["function"]["name"]
if "arguments" in tc["function"]:
# Arguments come in as pieces of a string
if "arguments_str" not in tool_call:
tool_call["arguments_str"] = ""
tool_call["arguments_str"] += tc["function"]["arguments"]
else:
raise ValueError(f"Don't know how to parse event delta: {delta_record}")
if first_choice["finish_reason"] == "tool_calls":
# Special OpenAI code for "tool calls complete".
# Output the buffered tool calls. Llama Stack requires a separate event per tool
# call.
for tool_call_record in index_to_tool_call.values():
# Arguments come in as a string. Parse the completed string.
tool_call_record["arguments"] = json.loads(tool_call_record["arguments_str"])
del tool_call_record["arguments_str"]
yield ChatCompletionResponseStreamChunk(
event=ChatCompletionResponseEvent(
event_type=ChatCompletionResponseEventType.progress,
delta=ToolCallDelta(tool_call=tool_call_record, parse_status="succeeded"),
stop_reason=converted_stop_reason,
)
)
# If we get here, we've lost the connection with the vLLM event stream before it ended
# normally.
raise ValueError("vLLM event stream ended without [DONE] message.")

View file

@ -45,7 +45,7 @@ public class LocalInference: Inference {
var tokens: [String] = []
let prompt = try encodeDialogPrompt(messages: prepareMessages(request: request))
var stopReason: Components.Schemas.StopReason? = nil
var stopReason: Components.Schemas.CompletionMessage.stop_reasonPayload? = nil
var buffer = ""
var ipython = false
var echoDropped = false
@ -69,13 +69,13 @@ public class LocalInference: Inference {
continuation.yield(
Components.Schemas.ChatCompletionResponseStreamChunk(
event: Components.Schemas.ChatCompletionResponseEvent(
event_type: .progress,
delta: .tool_call(Components.Schemas.ToolCallDelta(
parse_status: Components.Schemas.ToolCallParseStatus.started,
_type: Components.Schemas.ToolCallDelta._typePayload.tool_call,
tool_call: .case1(""),
_type: Components.Schemas.ToolCallDelta._typePayload.tool_call
parse_status: Components.Schemas.ToolCallDelta.parse_statusPayload.started
)
),
event_type: .progress
)
)
)
)
@ -89,9 +89,9 @@ public class LocalInference: Inference {
var text = ""
if token == "<|eot_id|>" {
stopReason = Components.Schemas.StopReason.end_of_turn
stopReason = Components.Schemas.CompletionMessage.stop_reasonPayload.end_of_turn
} else if token == "<|eom_id|>" {
stopReason = Components.Schemas.StopReason.end_of_message
stopReason = Components.Schemas.CompletionMessage.stop_reasonPayload.end_of_message
} else {
text = token
}
@ -99,14 +99,15 @@ public class LocalInference: Inference {
var delta: Components.Schemas.ContentDelta
if ipython {
delta = .tool_call(Components.Schemas.ToolCallDelta(
parse_status: .in_progress,
_type: .tool_call,
tool_call: .case1(text),
_type: .tool_call
parse_status: .in_progress
))
} else {
delta = .text(Components.Schemas.TextDelta(
text: text,
_type: Components.Schemas.TextDelta._typePayload.text)
_type: Components.Schemas.TextDelta._typePayload.text,
text: text
)
)
}
@ -114,8 +115,8 @@ public class LocalInference: Inference {
continuation.yield(
Components.Schemas.ChatCompletionResponseStreamChunk(
event: Components.Schemas.ChatCompletionResponseEvent(
delta: delta,
event_type: .progress
event_type: .progress,
delta: delta
)
)
)
@ -123,41 +124,41 @@ public class LocalInference: Inference {
}
if stopReason == nil {
stopReason = Components.Schemas.StopReason.out_of_tokens
stopReason = Components.Schemas.CompletionMessage.stop_reasonPayload.out_of_tokens
}
let message = decodeAssistantMessage(tokens: tokens.joined(), stopReason: stopReason!)
// TODO: non-streaming support
let didParseToolCalls = message.tool_calls.count > 0
let didParseToolCalls = message.tool_calls?.count ?? 0 > 0
if ipython && !didParseToolCalls {
continuation.yield(
Components.Schemas.ChatCompletionResponseStreamChunk(
event: Components.Schemas.ChatCompletionResponseEvent(
event_type: .progress,
delta: .tool_call(Components.Schemas.ToolCallDelta(
parse_status: Components.Schemas.ToolCallParseStatus.failed,
_type: Components.Schemas.ToolCallDelta._typePayload.tool_call,
tool_call: .case1(""),
_type: Components.Schemas.ToolCallDelta._typePayload.tool_call
parse_status: Components.Schemas.ToolCallDelta.parse_statusPayload.failed
)
),
event_type: .progress
)
)
// TODO: stopReason
)
)
}
for toolCall in message.tool_calls {
for toolCall in message.tool_calls! {
continuation.yield(
Components.Schemas.ChatCompletionResponseStreamChunk(
event: Components.Schemas.ChatCompletionResponseEvent(
event_type: .progress,
delta: .tool_call(Components.Schemas.ToolCallDelta(
parse_status: Components.Schemas.ToolCallParseStatus.succeeded,
_type: Components.Schemas.ToolCallDelta._typePayload.tool_call,
tool_call: Components.Schemas.ToolCallDelta.tool_callPayload.ToolCall(toolCall),
_type: Components.Schemas.ToolCallDelta._typePayload.tool_call
parse_status: Components.Schemas.ToolCallDelta.parse_statusPayload.succeeded
)
),
event_type: .progress
)
)
// TODO: stopReason
)
@ -167,11 +168,12 @@ public class LocalInference: Inference {
continuation.yield(
Components.Schemas.ChatCompletionResponseStreamChunk(
event: Components.Schemas.ChatCompletionResponseEvent(
event_type: .complete,
delta: .text(Components.Schemas.TextDelta(
text: "",
_type: Components.Schemas.TextDelta._typePayload.text)
),
event_type: .complete
_type: Components.Schemas.TextDelta._typePayload.text,
text: ""
)
)
)
// TODO: stopReason
)

View file

@ -38,10 +38,10 @@ func encodeMessage(message: Components.Schemas.Message) -> String {
switch (message) {
case .assistant(let m):
if (m.tool_calls.count > 0) {
if (m.tool_calls?.count ?? 0 > 0) {
prompt += "<|python_tag|>"
}
default:
default:0
break
}
@ -91,7 +91,7 @@ func encodeMessage(message: Components.Schemas.Message) -> String {
// for t in m.tool_calls {
// _processContent(t.)
//}
eom = m.stop_reason == Components.Schemas.StopReason.end_of_message
eom = m.stop_reason == Components.Schemas.CompletionMessage.stop_reasonPayload.end_of_message
case .system(_):
break
case .tool(_):
@ -124,8 +124,9 @@ func prepareMessages(request: Components.Schemas.ChatCompletionRequest) throws -
sysContent += try defaultTemplate.render()
messages.append(.system(Components.Schemas.SystemMessage(
content: .case1(sysContent),
role: .system))
role: .system,
content: .case1(sysContent)
))
)
if request.tools?.isEmpty == false {
@ -134,8 +135,8 @@ func prepareMessages(request: Components.Schemas.ChatCompletionRequest) throws -
let toolTemplate = try toolGen.gen(customTools: request.tools!)
let tools = try toolTemplate.render()
messages.append(.user(Components.Schemas.UserMessage(
content: .case1(tools),
role: .user)
role: .user,
content: .case1(tools))
))
}
@ -193,9 +194,9 @@ public func maybeExtractCustomToolCalls(input: String) -> [Components.Schemas.To
result.append(
Components.Schemas.ToolCall(
arguments: .init(additionalProperties: props),
call_id: UUID().uuidString,
tool_name: .case2(name) // custom_tool
tool_name: .case2(name), // custom_tool
arguments: .init(additionalProperties: props)
)
)
}
@ -206,7 +207,7 @@ public func maybeExtractCustomToolCalls(input: String) -> [Components.Schemas.To
}
}
func decodeAssistantMessage(tokens: String, stopReason: Components.Schemas.StopReason) -> Components.Schemas.CompletionMessage {
func decodeAssistantMessage(tokens: String, stopReason: Components.Schemas.CompletionMessage.stop_reasonPayload) -> Components.Schemas.CompletionMessage {
var content = tokens
let roles = ["user", "system", "assistant"]
@ -229,8 +230,8 @@ func decodeAssistantMessage(tokens: String, stopReason: Components.Schemas.StopR
}
return Components.Schemas.CompletionMessage(
content: .case1(content),
role: .assistant,
content: .case1(content),
stop_reason: stopReason,
tool_calls: maybeExtractCustomToolCalls(input: content)
)

@ -1 +0,0 @@
Subproject commit 9b6d4b4a7b9b8f811bb6b269b0c2ce254e3a0c1b

View file

@ -9,6 +9,9 @@
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from typing import Any
from llama_stack.apis.common.type_system import (
ChatCompletionInputType,
DialogType,
@ -20,7 +23,7 @@ from llama_stack.providers.utils.common.data_schema_validator import (
validate_dataset_schema,
)
EXPECTED_DATASET_SCHEMA = {
EXPECTED_DATASET_SCHEMA: dict[str, list[dict[str, Any]]] = {
"instruct": [
{
ColumnName.chat_completion_input.value: ChatCompletionInputType(),
@ -41,6 +44,9 @@ async def validate_input_dataset_schema(
dataset_type: str,
) -> None:
dataset_def = await datasets_api.get_dataset(dataset_id=dataset_id)
if not dataset_def:
raise ValueError(f"Dataset {dataset_id} does not exist.")
if not dataset_def.dataset_schema or len(dataset_def.dataset_schema) == 0:
raise ValueError(f"Dataset {dataset_id} does not have a schema defined.")

View file

@ -4,9 +4,9 @@
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from typing import Dict
from typing import Any, Dict
from llama_stack.distribution.datatypes import Api, ProviderSpec
from llama_stack.distribution.datatypes import Api
from .config import TorchtunePostTrainingConfig
@ -15,7 +15,7 @@ from .config import TorchtunePostTrainingConfig
async def get_provider_impl(
config: TorchtunePostTrainingConfig,
deps: Dict[Api, ProviderSpec],
deps: Dict[Api, Any],
):
from .post_training import TorchtunePostTrainingImpl

View file

@ -4,15 +4,25 @@
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import json
import os
import shutil
from pathlib import Path
from typing import Any, Dict, List
import torch
from safetensors.torch import save_file
from torchtune import training
from torchtune.models import convert_weights
from torchtune.training.checkpointing._utils import ModelType, safe_torch_load
from torchtune.training.checkpointing._utils import (
ADAPTER_CONFIG_FNAME,
ADAPTER_MODEL_FNAME,
REPO_ID_FNAME,
SUFFIXES_TO_NOT_COPY,
ModelType,
copy_files,
safe_torch_load,
)
from torchtune.utils._logging import get_logger
logger = get_logger("DEBUG")
@ -27,7 +37,7 @@ class TorchtuneCheckpointer:
checkpoint_files: List[str],
output_dir: str,
model_type: str,
) -> None:
):
# Fail fast if ``checkpoint_files`` is invalid
# TODO: support loading more than one file
if len(checkpoint_files) != 1:
@ -48,7 +58,7 @@ class TorchtuneCheckpointer:
"""
Load Meta checkpoint from file. Currently only loading from a single file is supported.
"""
state_dict: Dict[str:Any] = {}
state_dict: Dict[str, Any] = {}
model_state_dict = safe_torch_load(self._checkpoint_path)
if self._model_type == ModelType.LLAMA3_VISION:
from torchtune.models.llama3_2_vision._convert_weights import (
@ -75,9 +85,24 @@ class TorchtuneCheckpointer:
state_dict: Dict[str, Any],
epoch: int,
adapter_only: bool = False,
checkpoint_format: str | None = None,
) -> str:
model_file_path = Path(self._output_dir) / f"{self._model_id}-{self._training_algorithm}-{epoch}"
if checkpoint_format == "meta" or checkpoint_format is None:
self._save_meta_format_checkpoint(model_file_path, state_dict, adapter_only)
elif checkpoint_format == "huggingface":
# Note: for saving Hugging Face format checkpoints, we only support saving adapter weights now
self._save_hf_format_checkpoint(model_file_path, state_dict)
else:
raise ValueError(f"Unsupported checkpoint format: {format}")
return str(model_file_path)
def _save_meta_format_checkpoint(
self,
model_file_path: Path,
state_dict: Dict[str, Any],
adapter_only: bool = False,
) -> None:
model_file_path.mkdir(parents=True, exist_ok=True)
# copy the related files for inference
@ -140,6 +165,76 @@ class TorchtuneCheckpointer:
"Adapter checkpoint not found in state_dict. Please ensure that the state_dict contains adapter weights."
)
print("model_file_path", str(model_file_path))
def _save_hf_format_checkpoint(
self,
model_file_path: Path,
state_dict: Dict[str, Any],
) -> None:
# the config.json file contains model params needed for state dict conversion
config = json.loads(Path.joinpath(self._checkpoint_dir.parent, "config.json").read_text())
return str(model_file_path)
# repo_id is necessary when saving an adapter config, so it's compatible with HF.
# This json file is produced and saved in the download step.
# contents are {"repo_id": "some_model/some_model_version"}
repo_id_path = Path.joinpath(self._checkpoint_dir.parent, REPO_ID_FNAME).with_suffix(".json")
self.repo_id = None
if repo_id_path.exists():
with open(repo_id_path, "r") as json_file:
data = json.load(json_file)
self.repo_id = data.get("repo_id")
if training.ADAPTER_KEY in state_dict:
# TODO: saving it "as is" is a requirement because, if we only save with
# convert_weights.tune_to_peft_adapter_weights, we do NOT have a fn
# convert_weights.peft_to_tune. The .pt format is not needed, but
# it is an easy way to distinguish the adapters. Ideally we should save only one.
output_path = Path.joinpath(model_file_path, ADAPTER_MODEL_FNAME).with_suffix(".pt")
output_path.parent.mkdir(parents=True, exist_ok=True)
torch.save(state_dict[training.ADAPTER_KEY], output_path)
logger.info(
f"Adapter checkpoint of size {os.path.getsize(output_path) / 1024**3:.2f} GiB saved to {output_path}"
)
state_dict[training.ADAPTER_KEY] = convert_weights.tune_to_peft_adapter_weights(
state_dict[training.ADAPTER_KEY],
num_heads=config["num_attention_heads"],
num_kv_heads=config["num_key_value_heads"],
dim=config["hidden_size"],
head_dim=config.get("head_dim", None),
)
output_path = Path.joinpath(model_file_path, "adapter", ADAPTER_MODEL_FNAME)
output_path.parent.mkdir(parents=True, exist_ok=True)
output_path = output_path.with_suffix(".safetensors")
save_file(
state_dict[training.ADAPTER_KEY],
output_path,
metadata={"format": "pt"},
)
logger.info(
f"Adapter checkpoint of size {os.path.getsize(output_path) / 1024**3:.2f} GiB saved to {output_path}"
)
else:
raise ValueError(
"Adapter checkpoint not found in state_dict. Please ensure that the state_dict contains adapter weights."
)
if training.ADAPTER_CONFIG in state_dict:
state_dict[training.ADAPTER_CONFIG] = convert_weights.tune_to_peft_adapter_config(
adapter_config=state_dict[training.ADAPTER_CONFIG],
base_model_name_or_path=self.repo_id,
)
output_path = Path.joinpath(model_file_path, "adapter", ADAPTER_CONFIG_FNAME).with_suffix(".json")
with open(output_path, "w") as f:
json.dump(state_dict[training.ADAPTER_CONFIG], f)
logger.info(
f"Adapter checkpoint of size {os.path.getsize(output_path) / 1024**3:.2f} GiB saved to {output_path}"
)
# Save all files in ckpt_dir, except model weights and mapping, to output_dir/epoch_{epoch}
# So it's easy to run inference with the model using this epoch's checkpoint
copy_files(
self._checkpoint_dir.parent,
model_file_path,
ignore_suffixes=SUFFIXES_TO_NOT_COPY,
)
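
A hedged usage sketch of the new "huggingface" checkpoint path. The checkpointer instance, adapter weights, and adapter config below are placeholders; only save_checkpoint and the torchtune constants come from the code above.

from torchtune import training

def save_adapter_as_hf(checkpointer, adapter_weights, adapter_config, epoch=0):
    # Assumes tune-format LoRA adapter weights/config, as produced during training.
    state_dict = {
        training.ADAPTER_KEY: adapter_weights,
        training.ADAPTER_CONFIG: adapter_config,
    }
    # Per the implementation above, this writes the raw .pt adapter plus a
    # PEFT-compatible safetensors adapter and adapter config JSON under the
    # per-epoch output directory, and copies the remaining checkpoint files.
    return checkpointer.save_checkpoint(
        state_dict,
        epoch=epoch,
        checkpoint_format="huggingface",
    )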

View file

@ -10,7 +10,7 @@
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from typing import Any, Callable, Dict
from typing import Callable, Dict
import torch
from pydantic import BaseModel
@ -25,10 +25,13 @@ from llama_stack.apis.post_training import DatasetFormat
from llama_stack.models.llama.datatypes import Model
from llama_stack.models.llama.sku_list import resolve_model
BuildLoraModelCallable = Callable[..., torch.nn.Module]
BuildTokenizerCallable = Callable[..., Llama3Tokenizer]
class ModelConfig(BaseModel):
model_definition: Any
tokenizer_type: Any
model_definition: BuildLoraModelCallable
tokenizer_type: BuildTokenizerCallable
checkpoint_type: str
@ -51,10 +54,6 @@ DATA_FORMATS: Dict[str, Transform] = {
}
BuildLoraModelCallable = Callable[..., torch.nn.Module]
BuildTokenizerCallable = Callable[..., Llama3Tokenizer]
def _validate_model_id(model_id: str) -> Model:
model = resolve_model(model_id)
if model is None or model.core_model_id.value not in MODEL_CONFIGS:

View file

@ -4,10 +4,17 @@
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from typing import Optional
from typing import Any, Dict, Literal, Optional
from pydantic import BaseModel
class TorchtunePostTrainingConfig(BaseModel):
torch_seed: Optional[int] = None
checkpoint_format: Optional[Literal["meta", "huggingface"]] = "meta"
@classmethod
def sample_run_config(cls, __distro_dir__: str, **kwargs: Any) -> Dict[str, Any]:
return {
"checkpoint_format": "meta",
}

View file

@ -10,16 +10,19 @@
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
import json
from typing import Any, Mapping
from llama_stack.providers.utils.common.data_schema_validator import ColumnName
def llama_stack_instruct_to_torchtune_instruct(sample: Mapping[str, Any]) -> Mapping[str, Any]:
def llama_stack_instruct_to_torchtune_instruct(
sample: Mapping[str, Any],
) -> Mapping[str, Any]:
assert ColumnName.chat_completion_input.value in sample and ColumnName.expected_answer.value in sample, (
"Invalid input row"
)
input_messages = eval(str(sample[ColumnName.chat_completion_input.value]))
input_messages = json.loads(sample[ColumnName.chat_completion_input.value])
assert len(input_messages) == 1, "llama stack instruct dataset format only supports 1 user message"
input_message = input_messages[0]
@ -37,7 +40,7 @@ def llama_stack_instruct_to_torchtune_instruct(sample: Mapping[str, Any]) -> Map
def llama_stack_chat_to_torchtune_chat(sample: Mapping[str, Any]) -> Mapping[str, Any]:
assert ColumnName.dialog.value in sample, "Invalid input row"
role_map = {"user": "human", "assistant": "gpt"}
dialog = eval(str(sample[ColumnName.dialog.value]))
dialog = json.loads(sample[ColumnName.dialog.value])
assert len(dialog) > 1, "dialog must have at least 2 messages"
roles = []

View file

@ -55,7 +55,7 @@ class SFTDataset(Dataset):
if "messages" in transformed_sample:
validate_messages(transformed_sample["messages"])
tokenized_dict = self._model_transform(transformed_sample)
tokenized_dict: dict[str, Any] = self._model_transform(transformed_sample)
if not ("tokens" in tokenized_dict and "mask" in tokenized_dict):
keys_str = ", ".join(tokenized_dict.keys())

View file

@ -3,7 +3,7 @@
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from datetime import datetime
from datetime import datetime, timezone
from typing import Any, Dict, Optional
from llama_stack.apis.datasetio import DatasetIO
@ -43,6 +43,9 @@ class TorchtunePostTrainingImpl:
self.jobs = {}
self.checkpoints_dict = {}
async def shutdown(self):
pass
async def supervised_fine_tune(
self,
job_uuid: str,
@ -61,7 +64,7 @@ class TorchtunePostTrainingImpl:
job_status_response = PostTrainingJobStatusResponse(
job_uuid=job_uuid,
status=JobStatus.scheduled,
scheduled_at=datetime.now(),
scheduled_at=datetime.now(timezone.utc),
)
self.jobs[job_uuid] = job_status_response
@ -81,7 +84,7 @@ class TorchtunePostTrainingImpl:
)
job_status_response.status = JobStatus.in_progress
job_status_response.started_at = datetime.now()
job_status_response.started_at = datetime.now(timezone.utc)
await recipe.setup()
resources_allocated, checkpoints = await recipe.train()
@ -90,7 +93,7 @@ class TorchtunePostTrainingImpl:
job_status_response.resources_allocated = resources_allocated
job_status_response.checkpoints = checkpoints
job_status_response.status = JobStatus.completed
job_status_response.completed_at = datetime.now()
job_status_response.completed_at = datetime.now(timezone.utc)
except Exception:
job_status_response.status = JobStatus.failed

View file

@ -8,7 +8,7 @@ import gc
import logging
import os
import time
from datetime import datetime
from datetime import datetime, timezone
from functools import partial
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple
@ -37,10 +37,10 @@ from llama_stack.apis.common.training_types import PostTrainingMetric
from llama_stack.apis.datasetio import DatasetIO
from llama_stack.apis.datasets import Datasets
from llama_stack.apis.post_training import (
AlgorithmConfig,
Checkpoint,
LoraFinetuningConfig,
OptimizerConfig,
QATFinetuningConfig,
TrainingConfig,
)
from llama_stack.distribution.utils.config_dirs import DEFAULT_CHECKPOINT_DIR
@ -73,6 +73,9 @@ class LoraFinetuningSingleDevice:
# Currently logging only logs limited training metrics to local disk
# will figure out more logging and how it works with telemetry in future PRs
_checkpointer: TorchtuneCheckpointer
def __init__(
self,
config: TorchtunePostTrainingConfig,
@ -82,7 +85,7 @@ class LoraFinetuningSingleDevice:
logger_config: Dict[str, Any],
model: str,
checkpoint_dir: Optional[str],
algorithm_config: Optional[AlgorithmConfig],
algorithm_config: LoraFinetuningConfig | QATFinetuningConfig | None,
datasetio_api: DatasetIO,
datasets_api: Datasets,
) -> None:
@ -109,14 +112,15 @@ class LoraFinetuningSingleDevice:
return str(checkpoint_dir)
if checkpoint_dir and checkpoint_dir != "null":
self.checkpoint_dir = config.checkpoint_dir
self.checkpoint_dir = checkpoint_dir
else:
model = resolve_model(self.model_id)
if model is None:
model_obj = resolve_model(self.model_id)
if model_obj is None:
raise ValueError(f"{self.model_id} not found. Your model id should be in the llama models SKU list")
self.checkpoint_dir = model_checkpoint_dir(model)
self.checkpoint_dir = model_checkpoint_dir(model_obj)
self._output_dir = str(DEFAULT_CHECKPOINT_DIR)
self._checkpoint_format = config.checkpoint_format
self.seed = training.set_seed(seed=config.torch_seed)
self.epochs_run = 0
@ -134,16 +138,16 @@ class LoraFinetuningSingleDevice:
self.max_validation_steps = training_config.max_validation_steps
self._clip_grad_norm = 1.0
self._enable_activation_checkpointing = (
(training_config.efficiency_config.enable_activation_checkpointing)
if training_config.efficiency_config
else False
)
self._enable_activation_offloading = (
(training_config.efficiency_config.enable_activation_offloading)
if training_config.efficiency_config
else False
)
self._enable_activation_checkpointing = False
self._enable_activation_offloading = False
if training_config.efficiency_config:
if training_config.efficiency_config.enable_activation_checkpointing:
self._enable_activation_checkpointing = (
training_config.efficiency_config.enable_activation_checkpointing
)
if training_config.efficiency_config.enable_activation_offloading:
self._enable_activation_offloading = training_config.efficiency_config.enable_activation_offloading
self.datasetio_api = datasetio_api
self.datasets_api = datasets_api
@ -263,7 +267,7 @@ class LoraFinetuningSingleDevice:
)
self.adapter_params = get_adapter_params(model)
self._is_dora = any(["magnitude" in k for k in self.adapter_params.keys()])
self._is_dora = any("magnitude" in k for k in self.adapter_params.keys())
set_trainable_params(model, self.adapter_params)
@ -327,13 +331,13 @@ class LoraFinetuningSingleDevice:
batch_size: int,
) -> Tuple[DistributedSampler, DataLoader]:
async def fetch_rows(dataset_id: str):
return await self.datasetio_api.get_rows_paginated(
return await self.datasetio_api.iterrows(
dataset_id=dataset_id,
rows_in_page=-1,
limit=-1,
)
all_rows = await fetch_rows(dataset_id)
rows = all_rows.rows
rows = all_rows.data
await validate_input_dataset_schema(
datasets_api=self.datasets_api,
@ -419,6 +423,7 @@ class LoraFinetuningSingleDevice:
return self._checkpointer.save_checkpoint(
ckpt_dict,
epoch=epoch,
checkpoint_format=self._checkpoint_format,
)
async def _loss_step(self, batch: Dict[str, torch.Tensor]) -> torch.Tensor:
@ -449,18 +454,18 @@ class LoraFinetuningSingleDevice:
"""
# Initialize tokens count and running loss (for grad accumulation)
t0 = time.perf_counter()
running_loss = 0
running_loss: float = 0.0
num_tokens = 0
# training artifacts
checkpoints = []
memory_stats = {}
memory_stats: Dict[str, Any] = {}
# self.epochs_run should be non-zero when we're resuming from a checkpoint
for curr_epoch in range(self.epochs_run, self.total_epochs):
# Update the sampler to ensure data is correctly shuffled across epochs
# in case shuffle is True
metric_logger = DiskLogger(log_dir=self._output_dir + f"/{self.model_id}-sft-{curr_epoch}")
metric_logger = DiskLogger(log_dir=self._output_dir + f"/{self.model_id}-sft-{curr_epoch}/log")
self._training_sampler.set_epoch(curr_epoch)
loss_to_log = 0.0
@ -482,7 +487,7 @@ class LoraFinetuningSingleDevice:
# Loss is normalized by default so we multiply by the number of tokens
# This way we can normalize by the total number of tokens if we're accumulating gradients
current_loss = await self._loss_step(batch) * current_num_tokens
running_loss += current_loss
running_loss += current_loss.detach().item()
current_loss.backward()
# Step with optimizer
@ -498,7 +503,7 @@ class LoraFinetuningSingleDevice:
# Update the number of steps when the weights are updated
self.global_step += 1
loss_to_log = running_loss.item() / num_tokens
loss_to_log = running_loss / num_tokens
pbar.update(1)
pbar.set_description(f"{curr_epoch + 1}|{self.global_step}|Loss: {loss_to_log}")
@ -521,7 +526,7 @@ class LoraFinetuningSingleDevice:
)
# Reset running stats for the next step
running_loss = 0
running_loss = 0.0
num_tokens = 0
t0 = time.perf_counter()
@ -530,7 +535,7 @@ class LoraFinetuningSingleDevice:
checkpoint_path = await self.save_checkpoint(epoch=curr_epoch)
checkpoint = Checkpoint(
identifier=f"{self.model_id}-sft-{curr_epoch}",
created_at=datetime.now(),
created_at=datetime.now(timezone.utc),
epoch=curr_epoch,
post_training_job_id=self.job_uuid,
path=checkpoint_path,
@ -547,10 +552,11 @@ class LoraFinetuningSingleDevice:
checkpoints.append(checkpoint)
# clean up the memory after training finishes
self._model.to("cpu")
if self._device.type != "cpu":
self._model.to("cpu")
torch.cuda.empty_cache()
del self._model
gc.collect()
torch.cuda.empty_cache()
return (memory_stats, checkpoints)
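
The loss bookkeeping above multiplies each batch's mean loss by its token count and only normalizes by the total token count at the optimizer step, so gradient accumulation weights batches by how many real tokens they contain. A standalone sketch with made-up numbers:

# Standalone sketch of the token-weighted loss accumulation used above
# (hypothetical numbers; no model, optimizer, or gradients involved).
batches = [(2.0, 10), (1.0, 30)]  # (mean loss over batch tokens, token count)
running_loss, num_tokens = 0.0, 0
for mean_loss, n_tokens in batches:
    running_loss += mean_loss * n_tokens  # undo the per-batch mean
    num_tokens += n_tokens
loss_to_log = running_loss / num_tokens  # normalize once over all tokens
assert abs(loss_to_log - 1.25) < 1e-9  # (2.0*10 + 1.0*30) / 40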

View file

@ -4,10 +4,12 @@
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from typing import Any, Dict
from .config import CodeScannerConfig
async def get_provider_impl(config: CodeScannerConfig, deps):
async def get_provider_impl(config: CodeScannerConfig, deps: Dict[str, Any]):
from .code_scanner import MetaReferenceCodeScannerSafetyImpl
impl = MetaReferenceCodeScannerSafetyImpl(config, deps)

View file

@ -4,8 +4,12 @@
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from typing import Any, Dict
from pydantic import BaseModel
class CodeScannerConfig(BaseModel):
pass
@classmethod
def sample_run_config(cls, __distro_dir__: str, **kwargs: Any) -> Dict[str, Any]:
return {}

View file

@ -4,10 +4,12 @@
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from typing import Any, Dict
from .config import LlamaGuardConfig
async def get_provider_impl(config: LlamaGuardConfig, deps):
async def get_provider_impl(config: LlamaGuardConfig, deps: Dict[str, Any]):
from .llama_guard import LlamaGuardSafetyImpl
assert isinstance(config, LlamaGuardConfig), f"Unexpected config type: {type(config)}"

View file

@ -4,10 +4,16 @@
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from typing import List
from typing import Any, Dict, List
from pydantic import BaseModel
class LlamaGuardConfig(BaseModel):
excluded_categories: List[str] = []
@classmethod
def sample_run_config(cls, __distro_dir__: str, **kwargs: Any) -> Dict[str, Any]:
return {
"excluded_categories": [],
}

View file

@ -227,13 +227,6 @@ class LlamaGuardShield:
if len(messages) >= 2 and (messages[0].role == Role.user.value and messages[1].role == Role.user.value):
messages = messages[1:]
for i in range(1, len(messages)):
if messages[i].role == messages[i - 1].role:
for i, m in enumerate(messages):
print(f"{i}: {m.role}: {m.content}")
raise ValueError(
f"Messages must alternate between user and assistant. Message {i} has the same role as message {i - 1}"
)
return messages
async def run(self, messages: List[Message]) -> RunShieldResponse:

View file

@ -4,10 +4,12 @@
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from typing import Any, Dict
from .config import PromptGuardConfig # noqa: F401
async def get_provider_impl(config: PromptGuardConfig, deps):
async def get_provider_impl(config: PromptGuardConfig, deps: Dict[str, Any]):
from .prompt_guard import PromptGuardSafetyImpl
impl = PromptGuardSafetyImpl(config, deps)

View file

@ -5,6 +5,7 @@
# the root directory of this source tree.
from enum import Enum
from typing import Any, Dict
from pydantic import BaseModel, field_validator
@ -23,3 +24,9 @@ class PromptGuardConfig(BaseModel):
if v not in [t.value for t in PromptGuardType]:
raise ValueError(f"Unknown prompt guard type: {v}")
return v
@classmethod
def sample_run_config(cls, __distro_dir__: str, **kwargs: Any) -> Dict[str, Any]:
return {
"guard_type": "injection",
}

View file

@ -3,16 +3,16 @@
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from typing import Dict
from typing import Any, Dict
from llama_stack.distribution.datatypes import Api, ProviderSpec
from llama_stack.distribution.datatypes import Api
from .config import BasicScoringConfig
async def get_provider_impl(
config: BasicScoringConfig,
deps: Dict[Api, ProviderSpec],
deps: Dict[Api, Any],
):
from .scoring import BasicScoringImpl

View file

@ -3,7 +3,12 @@
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from typing import Any, Dict
from pydantic import BaseModel
class BasicScoringConfig(BaseModel): ...
class BasicScoringConfig(BaseModel):
@classmethod
def sample_run_config(cls, __distro_dir__: str, **kwargs: Any) -> Dict[str, Any]:
return {}

View file

@ -22,11 +22,25 @@ from llama_stack.providers.utils.common.data_schema_validator import (
)
from .config import BasicScoringConfig
from .scoring_fn.bfcl_scoring_fn import BFCLScoringFn
from .scoring_fn.docvqa_scoring_fn import DocVQAScoringFn
from .scoring_fn.equality_scoring_fn import EqualityScoringFn
from .scoring_fn.ifeval_scoring_fn import IfEvalScoringFn
from .scoring_fn.regex_parser_math_response_scoring_fn import (
RegexParserMathResponseScoringFn,
)
from .scoring_fn.regex_parser_scoring_fn import RegexParserScoringFn
from .scoring_fn.subset_of_scoring_fn import SubsetOfScoringFn
FIXED_FNS = [EqualityScoringFn, SubsetOfScoringFn, RegexParserScoringFn]
FIXED_FNS = [
EqualityScoringFn,
SubsetOfScoringFn,
RegexParserScoringFn,
RegexParserMathResponseScoringFn,
BFCLScoringFn,
IfEvalScoringFn,
DocVQAScoringFn,
]
class BasicScoringImpl(
@ -74,12 +88,12 @@ class BasicScoringImpl(
dataset_def = await self.datasets_api.get_dataset(dataset_id=dataset_id)
validate_dataset_schema(dataset_def.dataset_schema, get_valid_schemas(Api.scoring.value))
all_rows = await self.datasetio_api.get_rows_paginated(
all_rows = await self.datasetio_api.iterrows(
dataset_id=dataset_id,
rows_in_page=-1,
limit=-1,
)
res = await self.score(
input_rows=all_rows.rows,
input_rows=all_rows.data,
scoring_functions=scoring_functions,
)
if save_results_dataset:

View file

@ -0,0 +1,93 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import json
import re
from typing import Any, Dict, Optional
from llama_stack.apis.scoring import ScoringResultRow
from llama_stack.apis.scoring_functions import ScoringFnParams
from llama_stack.providers.utils.scoring.base_scoring_fn import RegisteredBaseScoringFn
from ..utils.bfcl.ast_parser import decode_ast
from ..utils.bfcl.checker import ast_checker, is_empty_output
from .fn_defs.bfcl import bfcl
def postprocess(x: Dict[str, Any], test_category: str) -> Dict[str, Any]:
contain_func_call = False
error = None
error_type = None
checker_result = {}
try:
prediction = decode_ast(x["generated_answer"], x["language"]) or ""
contain_func_call = True
# if not is_function_calling_format_output(prediction):
if is_empty_output(prediction):
contain_func_call = False
error = "Did not output in the specified format. Note: the model_result is wrapped in a string to ensure json serializability."
error_type = "ast_decoder:decoder_wrong_output_format"
else:
checker_result = ast_checker(
json.loads(x["function"]),
prediction,
json.loads(x["ground_truth"]),
x["language"],
test_category=test_category,
model_name="",
)
except Exception as e:
prediction = ""
error = f"Invalid syntax. Failed to decode AST. {str(e)}"
error_type = "ast_decoder:decoder_failed"
return {
"prediction": prediction,
"contain_func_call": contain_func_call,
"valid": checker_result.get("valid", False),
"error": error or checker_result.get("error", ""),
"error_type": error_type or checker_result.get("error_type", ""),
}
def gen_valid(x: Dict[str, Any]) -> Dict[str, float]:
return {"valid": x["valid"]}
def gen_relevance_acc(x: Dict[str, Any]) -> Dict[str, float]:
# This function serves both relevance and irrelevance tests, which use exactly opposite logic.
# If `test_category` is "irrelevance", the model is expected to output no function call.
# No function call means either the AST decoding fails (an error message is generated) or the decoded AST does not contain any function call (such as an empty list, `[]`).
# If `test_category` is "relevance", the model is expected to output a function call, and an empty list doesn't count as a function call.
acc = not x["contain_func_call"] if "irrelevance" in x["id"] else x["contain_func_call"]
return {"valid": float(acc)}
class BFCLScoringFn(RegisteredBaseScoringFn):
"""
A scoring_fn for BFCL
"""
def __init__(self, *args, **kwargs) -> None:
super().__init__(*args, **kwargs)
self.supported_fn_defs_registry = {
bfcl.identifier: bfcl,
}
async def score_row(
self,
input_row: Dict[str, Any],
scoring_fn_identifier: Optional[str] = "bfcl",
scoring_params: Optional[ScoringFnParams] = None,
) -> ScoringResultRow:
test_category = re.sub(r"_[0-9_-]+$", "", input_row["id"])
score_result = postprocess(input_row, test_category)
if test_category in {"irrelevance", "live_relevance", "live_irrelevance"}:
score = gen_relevance_acc(score_result)["valid"]
else:
score = gen_valid(score_result)["valid"]
return {
"score": float(score),
}
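
To make the category handling above concrete, a small hedged illustration with made-up row ids (it reuses gen_relevance_acc defined earlier in this file):

import re

def category_of(row_id):
    # Same normalization as score_row: strip the trailing numeric suffix.
    return re.sub(r"_[0-9_-]+$", "", row_id)

assert category_of("simple_123") == "simple"
assert category_of("live_irrelevance_5-2") == "live_irrelevance"

# For the (ir)relevance categories, a row scores 1.0 when the model correctly
# refrains from (or produces) a function call; other categories use the AST checker.
row = {"id": "live_irrelevance_5-2", "contain_func_call": False}
assert gen_relevance_acc(row) == {"valid": 1.0}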

View file

@ -0,0 +1,240 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import json
import re
from typing import Any, Dict, Optional
from llama_stack.apis.scoring import ScoringResultRow
from llama_stack.apis.scoring_functions import ScoringFnParams
from llama_stack.providers.utils.scoring.base_scoring_fn import RegisteredBaseScoringFn
from .fn_defs.docvqa import docvqa
CONTRACTIONS = {
"aint": "ain't",
"arent": "aren't",
"cant": "can't",
"couldve": "could've",
"couldnt": "couldn't",
"couldn'tve": "couldn't've",
"couldnt've": "couldn't've",
"didnt": "didn't",
"doesnt": "doesn't",
"dont": "don't",
"hadnt": "hadn't",
"hadnt've": "hadn't've",
"hadn'tve": "hadn't've",
"hasnt": "hasn't",
"havent": "haven't",
"hed": "he'd",
"hed've": "he'd've",
"he'dve": "he'd've",
"hes": "he's",
"howd": "how'd",
"howll": "how'll",
"hows": "how's",
"Id've": "I'd've",
"I'dve": "I'd've",
"Im": "I'm",
"Ive": "I've",
"isnt": "isn't",
"itd": "it'd",
"itd've": "it'd've",
"it'dve": "it'd've",
"itll": "it'll",
"let's": "let's",
"maam": "ma'am",
"mightnt": "mightn't",
"mightnt've": "mightn't've",
"mightn'tve": "mightn't've",
"mightve": "might've",
"mustnt": "mustn't",
"mustve": "must've",
"neednt": "needn't",
"notve": "not've",
"oclock": "o'clock",
"oughtnt": "oughtn't",
"ow's'at": "'ow's'at",
"'ows'at": "'ow's'at",
"'ow'sat": "'ow's'at",
"shant": "shan't",
"shed've": "she'd've",
"she'dve": "she'd've",
"she's": "she's",
"shouldve": "should've",
"shouldnt": "shouldn't",
"shouldnt've": "shouldn't've",
"shouldn'tve": "shouldn't've",
"somebody'd": "somebodyd",
"somebodyd've": "somebody'd've",
"somebody'dve": "somebody'd've",
"somebodyll": "somebody'll",
"somebodys": "somebody's",
"someoned": "someone'd",
"someoned've": "someone'd've",
"someone'dve": "someone'd've",
"someonell": "someone'll",
"someones": "someone's",
"somethingd": "something'd",
"somethingd've": "something'd've",
"something'dve": "something'd've",
"somethingll": "something'll",
"thats": "that's",
"thered": "there'd",
"thered've": "there'd've",
"there'dve": "there'd've",
"therere": "there're",
"theres": "there's",
"theyd": "they'd",
"theyd've": "they'd've",
"they'dve": "they'd've",
"theyll": "they'll",
"theyre": "they're",
"theyve": "they've",
"twas": "'twas",
"wasnt": "wasn't",
"wed've": "we'd've",
"we'dve": "we'd've",
"weve": "we've",
"werent": "weren't",
"whatll": "what'll",
"whatre": "what're",
"whats": "what's",
"whatve": "what've",
"whens": "when's",
"whered": "where'd",
"wheres": "where's",
"whereve": "where've",
"whod": "who'd",
"whod've": "who'd've",
"who'dve": "who'd've",
"wholl": "who'll",
"whos": "who's",
"whove": "who've",
"whyll": "why'll",
"whyre": "why're",
"whys": "why's",
"wont": "won't",
"wouldve": "would've",
"wouldnt": "wouldn't",
"wouldnt've": "wouldn't've",
"wouldn'tve": "wouldn't've",
"yall": "y'all",
"yall'll": "y'all'll",
"y'allll": "y'all'll",
"yall'd've": "y'all'd've",
"y'alld've": "y'all'd've",
"y'all'dve": "y'all'd've",
"youd": "you'd",
"youd've": "you'd've",
"you'dve": "you'd've",
"youll": "you'll",
"youre": "you're",
"youve": "you've",
"1st": "first",
"2nd": "second",
"3rd": "third",
}
NUMBERS = {
"none": "0",
"zero": "0",
"one": "1",
"two": "2",
"three": "3",
"four": "4",
"five": "5",
"six": "6",
"seven": "7",
"eight": "8",
"nine": "9",
"ten": "10",
}
ARTICLES = [
"a",
"an",
"the",
"to",
"in",
"from",
"by",
] # Contains a bit more than just articles, but we want to prevent these elements from influencing the accuracy
PERIOD_STRIP = re.compile(r"(?!<=\d)(\.)(?!\d)")
COMMA_STRIP = re.compile(r"(\d)(\,)(\d)")
PUNCTUATION = [
";",
r"/",
"[",
"]",
'"',
"{",
"}",
"(",
")",
"=",
"+",
"\\",
"_",
"-",
">",
"<",
"@",
"`",
",",
"?",
"!",
]
def normalize_answer(s: str) -> str:
# process punctuation
for p in PUNCTUATION:
if (p + " " in s or " " + p in s) or (re.search(COMMA_STRIP, s) is not None):
s = s.replace(p, "")
else:
s = s.replace(p, " ")
s = PERIOD_STRIP.sub("", s, re.UNICODE)
# process digits and articles
temp_text = s.lower().split()
out_text = []
for word in temp_text:
word = NUMBERS.get(word, word)  # use get() so the lookup does not mutate NUMBERS
if word not in ARTICLES:
out_text.append(word)
# standardize contractions
for word_id, word in enumerate(out_text):
if word in CONTRACTIONS:
out_text[word_id] = CONTRACTIONS[word]
return " ".join(out_text)
class DocVQAScoringFn(RegisteredBaseScoringFn):
"""
DocVQA scoring matches the generated answer against several allowed
reference answers; both sides are normalized first to avoid penalizing
trivial differences
"""
def __init__(self, *args, **kwargs) -> None:
super().__init__(*args, **kwargs)
self.supported_fn_defs_registry = {
docvqa.identifier: docvqa,
}
async def score_row(
self,
input_row: Dict[str, Any],
scoring_fn_identifier: Optional[str] = "docvqa",
scoring_params: Optional[ScoringFnParams] = None,
) -> ScoringResultRow:
expected_answers = json.loads(input_row["expected_answer"])
generated_answer = input_row["generated_answer"]
score = 1.0 if normalize_answer(generated_answer) in [normalize_answer(s) for s in expected_answers] else 0.0
return {
"score": score,
}
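
A quick illustration of the normalization behavior described above (the answers are made up):

# Illustrative only: articles are dropped, punctuation and word forms are
# canonicalized, so trivially different strings compare equal.
assert normalize_answer("The answer is 3 dogs.") == "answer is 3 dogs"
assert normalize_answer("the 3rd of May") == "third of may"
assert normalize_answer("The answer is 3 dogs.") == normalize_answer("the answer is 3 dogs")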

View file

@ -0,0 +1,21 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from llama_stack.apis.common.type_system import NumberType
from llama_stack.apis.scoring_functions import (
AggregationFunctionType,
BasicScoringFnParams,
ScoringFn,
)
bfcl = ScoringFn(
identifier="basic::bfcl",
description="BFCL complex scoring",
return_type=NumberType(),
provider_id="basic",
provider_resource_id="bfcl",
params=BasicScoringFnParams(aggregation_functions=[AggregationFunctionType.accuracy]),
)

View file

@ -0,0 +1,21 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from llama_stack.apis.common.type_system import NumberType
from llama_stack.apis.scoring_functions import (
AggregationFunctionType,
BasicScoringFnParams,
ScoringFn,
)
docvqa = ScoringFn(
identifier="basic::docvqa",
description="DocVQA Visual Question & Answer scoring function",
return_type=NumberType(),
provider_id="basic",
provider_resource_id="docvqa",
params=BasicScoringFnParams(aggregation_functions=[AggregationFunctionType.accuracy]),
)

View file

@ -0,0 +1,23 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from llama_stack.apis.common.type_system import NumberType
from llama_stack.apis.scoring_functions import (
AggregationFunctionType,
BasicScoringFnParams,
ScoringFn,
)
ifeval = ScoringFn(
identifier="basic::ifeval",
description="Eval intruction follow capacity by checkping how many instructions can be followed in each example",
return_type=NumberType(),
provider_id="basic",
provider_resource_id="ifeval",
params=BasicScoringFnParams(
aggregation_functions=[AggregationFunctionType.weighted_average],
),
)

View file

@ -0,0 +1,27 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from llama_stack.apis.common.type_system import NumberType
from llama_stack.apis.scoring_functions import (
AggregationFunctionType,
RegexParserScoringFnParams,
ScoringFn,
)
MATH_ANSWER_REGEXES = [r".*final answer is:?\s*\$\\boxed{(?P<X>.*)}\$"]
regex_parser_math_response = ScoringFn(
identifier="basic::regex_parser_math_response",
description="For math related benchmarks, extract answer from the generated response and expected_answer and see if they match",
return_type=NumberType(),
provider_id="basic",
provider_resource_id="regex-parser-math-response",
params=RegexParserScoringFnParams(
parsing_regexes=MATH_ANSWER_REGEXES,
aggregation_functions=[AggregationFunctionType.accuracy],
),
)
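
A hedged demonstration of the parsing regex above on a made-up generation:

import re

generation = r"We combine the terms. The final answer is: $\boxed{\frac{1}{2}}$"
match = re.search(MATH_ANSWER_REGEXES[0], generation)
assert match is not None and match.group("X") == r"\frac{1}{2}"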

View file

@ -12,6 +12,7 @@ from llama_stack.apis.scoring_functions import (
)
MULTILINGUAL_ANSWER_REGEXES = [
r"The best answer is ",
r"Answer\s*:",
r"Answer\s*:", # Korean invisible character
r"উত্তর\s*:",

View file

@ -0,0 +1,79 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from typing import Any, Dict, Optional
from llama_stack.apis.scoring import ScoringResultRow
from llama_stack.apis.scoring_functions import ScoringFnParams
from llama_stack.providers.utils.scoring.base_scoring_fn import RegisteredBaseScoringFn
from ..utils.ifeval_utils import INSTRUCTION_DICT, INSTRUCTION_LIST
from .fn_defs.ifeval import (
ifeval,
)
class IfEvalScoringFn(RegisteredBaseScoringFn):
"""
A scoring_fn for the Instruction-Following Eval (IFEval) benchmark
"""
def __init__(self, *args, **kwargs) -> None:
super().__init__(*args, **kwargs)
self.supported_fn_defs_registry = {
ifeval.identifier: ifeval,
}
async def score_row(
self,
input_row: Dict[str, Any],
scoring_fn_identifier: Optional[str] = None,
scoring_params: Optional[ScoringFnParams] = None,
) -> ScoringResultRow:
assert scoring_fn_identifier is not None, "Scoring function identifier not found."
fn_def = self.supported_fn_defs_registry[scoring_fn_identifier]
if scoring_params is not None:
fn_def.params = scoring_params
instruction_list = input_row["instruction_id_list"]
generated_answer = input_row["generated_answer"].strip()
is_following_list = []
results = dict(
{k + "_correct": 0.0 for k in INSTRUCTION_LIST},
**{k + "_total": 0.0 for k in INSTRUCTION_LIST},
)
for index, instruction_id in enumerate(instruction_list):
instruction_cls = INSTRUCTION_DICT[instruction_id]
instruction = instruction_cls(instruction_id)
results[instruction_id + "_total"] += 1.0
results[instruction_id.split(":")[0] + "_total"] += 1.0
clean_input_row = {k: v for k, v in input_row["kwargs"][index].items() if v is not None}
print(clean_input_row)
instruction.build_description(**clean_input_row)
args = instruction.get_instruction_args()
if args and "prompt" in args:
instruction.build_description(prompt=input_row["prompt"])
if generated_answer and instruction.check_following(generated_answer):
is_following_list.append(True)
results[instruction_id + "_correct"] += 1.0
results[instruction_id.split(":")[0] + "_correct"] += 1.0
else:
is_following_list.append(False)
if len(is_following_list) == 0:
return {
"score": 0.0,
"weight": 0.0,
}
return {
"score": float(sum(is_following_list)) / float(len(is_following_list)),
"weight": float(len(is_following_list)),
}

View file

@ -0,0 +1,66 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from typing import Any, Dict, Optional
from llama_stack.apis.scoring import ScoringResultRow
from llama_stack.apis.scoring_functions import ScoringFnParams, ScoringFnParamsType
from llama_stack.providers.utils.scoring.base_scoring_fn import RegisteredBaseScoringFn
from ..utils.math_utils import first_answer, normalize_final_answer, try_evaluate_frac, try_evaluate_latex
from .fn_defs.regex_parser_math_response import (
regex_parser_math_response,
)
class RegexParserMathResponseScoringFn(RegisteredBaseScoringFn):
"""
A scoring_fn for math benchmarks that parses the answer from the generated response according to context and checks whether it matches the expected_answer.
"""
def __init__(self, *args, **kwargs) -> None:
super().__init__(*args, **kwargs)
self.supported_fn_defs_registry = {
regex_parser_math_response.identifier: regex_parser_math_response,
}
async def score_row(
self,
input_row: Dict[str, Any],
scoring_fn_identifier: Optional[str] = None,
scoring_params: Optional[ScoringFnParams] = None,
) -> ScoringResultRow:
assert scoring_fn_identifier is not None, "Scoring function identifier not found."
fn_def = self.supported_fn_defs_registry[scoring_fn_identifier]
if scoring_params is not None:
fn_def.params = scoring_params
assert fn_def.params is not None and fn_def.params.type == ScoringFnParamsType.regex_parser.value, (
f"RegexParserScoringFnParams not found for {fn_def}."
)
expected_answer = input_row["expected_answer"]
generated_answer = input_row["generated_answer"]
parsing_regexes = fn_def.params.parsing_regexes
assert len(parsing_regexes) == 1, (
"Only one parsing regex is supported for regex_parser_math_response scoring function."
)
parsing_regexes = fn_def.params.parsing_regexes[0]
normalized_generated_answer = normalize_final_answer(
first_answer(generated_answer),
parsing_regexes,
match_first=True,
)
normalized_generated_answer = try_evaluate_frac(try_evaluate_latex(normalized_generated_answer))
normalized_expected_answer = normalize_final_answer(expected_answer, r".*")
normalized_expected_answer = try_evaluate_frac(try_evaluate_latex(normalized_expected_answer))
score = 1.0 if normalized_generated_answer == normalized_expected_answer else 0.0
return {
"score": score,
}

View file

@ -0,0 +1,296 @@
# ruff: noqa
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import ast
from .tree_sitter import get_parser
def parse_java_function_call(source_code):
if not source_code.endswith(";"):
source_code += ";" # Necessary for the parser not to register an error
parser = get_parser("java")
tree = parser.parse(bytes(source_code, "utf8"))
root_node = tree.root_node
if root_node.has_error:
raise Exception("Error parsing java the source code.")
def get_text(node):
"""Returns the text represented by the node."""
return source_code[node.start_byte : node.end_byte]
def traverse_node(node, nested=False):
if node.type == "string_literal":
if nested:
return get_text(node)
# Strip surrounding quotes from string literals
return get_text(node)[1:-1]
elif node.type == "character_literal":
if nested:
return get_text(node)
# Strip surrounding single quotes from character literals
return get_text(node)[1:-1]
"""Traverse the node to collect texts for complex structures."""
if node.type in [
"identifier",
"class_literal",
"type_identifier",
"method_invocation",
]:
return get_text(node)
elif node.type == "array_creation_expression":
# Handle array creation expression specifically
type_node = node.child_by_field_name("type")
value_node = node.child_by_field_name("value")
type_text = traverse_node(type_node, True)
value_text = traverse_node(value_node, True)
return f"new {type_text}[]{value_text}"
elif node.type == "object_creation_expression":
# Handle object creation expression specifically
type_node = node.child_by_field_name("type")
arguments_node = node.child_by_field_name("arguments")
type_text = traverse_node(type_node, True)
if arguments_node:
# Process each argument carefully, avoiding unnecessary punctuation
argument_texts = []
for child in arguments_node.children:
if child.type not in [
",",
"(",
")",
]: # Exclude commas and parentheses
argument_text = traverse_node(child, True)
argument_texts.append(argument_text)
arguments_text = ", ".join(argument_texts)
return f"new {type_text}({arguments_text})"
else:
return f"new {type_text}()"
elif node.type == "set":
# Handling sets specifically
items = [traverse_node(n, True) for n in node.children if n.type not in [",", "set"]]
return "{" + ", ".join(items) + "}"
elif node.child_count > 0:
return "".join(traverse_node(child, True) for child in node.children)
else:
return get_text(node)
def extract_arguments(args_node):
arguments = {}
for child in args_node.children:
if child.type == "assignment_expression":
# For named parameters
name_node, value_node = child.children[0], child.children[2]
name = get_text(name_node)
value = traverse_node(value_node)
if name in arguments:
if not isinstance(arguments[name], list):
arguments[name] = [arguments[name]]
arguments[name].append(value)
else:
arguments[name] = value
# arguments.append({'name': name, 'value': value})
elif child.type in ["identifier", "class_literal", "set"]:
# For unnamed parameters and handling sets
value = traverse_node(child)
if None in arguments:
if not isinstance(arguments[None], list):
arguments[None] = [arguments[None]]
arguments[None].append(value)
else:
arguments[None] = value
return arguments
def traverse(node):
if node.type == "method_invocation":
# Extract the function name and its arguments
method_name = get_text(node.child_by_field_name("name"))
class_name_node = node.child_by_field_name("object")
if class_name_node:
class_name = get_text(class_name_node)
function_name = f"{class_name}.{method_name}"
else:
function_name = method_name
arguments_node = node.child_by_field_name("arguments")
if arguments_node:
arguments = extract_arguments(arguments_node)
for key, value in arguments.items():
if isinstance(value, list):
raise Exception("Error: Multiple arguments with the same name are not supported.")
return [{function_name: arguments}]
else:
for child in node.children:
result = traverse(child)
if result:
return result
result = traverse(root_node)
return result if result else {}
def parse_javascript_function_call(source_code):
if not source_code.endswith(";"):
source_code += ";" # Necessary for the parser not to register an error
parser = get_parser("javascript")
# Parse the source code
tree = parser.parse(bytes(source_code, "utf8"))
root_node = tree.root_node
if root_node.has_error:
raise Exception("Error js parsing the source code.")
# Function to recursively extract argument details
def extract_arguments(node):
args = {}
for child in node.children:
if child.type == "assignment_expression":
# Extract left (name) and right (value) parts of the assignment
name = child.children[0].text.decode("utf-8")
value = child.children[2].text.decode("utf-8")
if (value.startswith('"') and value.endswith('"')) or (value.startswith("'") and value.endswith("'")):
value = value[1:-1] # Trim the quotation marks
if name in args:
if not isinstance(args[name], list):
args[name] = [args[name]]
args[name].append(value)
else:
args[name] = value
elif child.type == "identifier" or child.type == "true":
# Handle non-named arguments and boolean values
value = child.text.decode("utf-8")
if None in args:
if not isinstance(args[None], list):
args[None] = [args[None]]
args[None].append(value)
else:
args[None] = value
return args
# Find the function call and extract its name and arguments
if root_node.type == "program":
for child in root_node.children:
if child.type == "expression_statement":
for sub_child in child.children:
if sub_child.type == "call_expression":
function_name = sub_child.children[0].text.decode("utf8")
arguments_node = sub_child.children[1]
parameters = extract_arguments(arguments_node)
for key, value in parameters.items():
if isinstance(value, list):
raise Exception("Error: Multiple arguments with the same name are not supported.")
result = [{function_name: parameters}]
return result
def ast_parse(input_str, language="Python"):
if language == "Python":
cleaned_input = input_str.strip("[]'")
parsed = ast.parse(cleaned_input, mode="eval")
extracted = []
if isinstance(parsed.body, ast.Call):
extracted.append(resolve_ast_call(parsed.body))
else:
for elem in parsed.body.elts:
extracted.append(resolve_ast_call(elem))
return extracted
elif language == "Java":
return parse_java_function_call(input_str[1:-1]) # Remove the [ and ] from the string
elif language == "JavaScript":
return parse_javascript_function_call(input_str[1:-1])
else:
raise NotImplementedError(f"Unsupported language: {language}")
def resolve_ast_call(elem):
# Handle nested attributes for deeply nested module paths
func_parts = []
func_part = elem.func
while isinstance(func_part, ast.Attribute):
func_parts.append(func_part.attr)
func_part = func_part.value
if isinstance(func_part, ast.Name):
func_parts.append(func_part.id)
func_name = ".".join(reversed(func_parts))
args_dict = {}
# Parse when args are simply passed as an unnamed dictionary arg
for arg in elem.args:
if isinstance(arg, ast.Dict):
for key, value in zip(arg.keys, arg.values):
if isinstance(key, ast.Constant):
arg_name = key.value
output = resolve_ast_by_type(value)
args_dict[arg_name] = output
for arg in elem.keywords:
output = resolve_ast_by_type(arg.value)
args_dict[arg.arg] = output
return {func_name: args_dict}
def resolve_ast_by_type(value):
if isinstance(value, ast.Constant):
if value.value is Ellipsis:
output = "..."
else:
output = value.value
elif isinstance(value, ast.UnaryOp):
output = -value.operand.value
elif isinstance(value, ast.List):
output = [resolve_ast_by_type(v) for v in value.elts]
elif isinstance(value, ast.Dict):
output = {resolve_ast_by_type(k): resolve_ast_by_type(v) for k, v in zip(value.keys, value.values)}
elif isinstance(value, ast.NameConstant): # Added this condition to handle boolean values
output = value.value
elif isinstance(value, ast.BinOp): # Added this condition to handle function calls as arguments
output = eval(ast.unparse(value))
elif isinstance(value, ast.Name):
output = value.id
elif isinstance(value, ast.Call):
if len(value.keywords) == 0:
output = ast.unparse(value)
else:
output = resolve_ast_call(value)
elif isinstance(value, ast.Tuple):
output = tuple(resolve_ast_by_type(v) for v in value.elts)
elif isinstance(value, ast.Lambda):
output = eval(ast.unparse(value.body[0].value))
elif isinstance(value, ast.Ellipsis):
output = "..."
elif isinstance(value, ast.Subscript):
try:
output = ast.unparse(value.body[0].value)
except:
output = ast.unparse(value.value) + "[" + ast.unparse(value.slice) + "]"
else:
raise Exception(f"Unsupported AST type: {type(value)}")
return output
def decode_ast(result, language="Python"):
func = result
func = func.replace("\n", "") # remove new line characters
if not func.startswith("["):
func = "[" + func
if not func.endswith("]"):
func = func + "]"
decoded_output = ast_parse(func, language)
return decoded_output
def decode_execute(result):
func = result
func = func.replace("\n", "") # remove new line characters
if not func.startswith("["):
func = "[" + func
if not func.endswith("]"):
func = func + "]"
decode_output = ast_parse(func)
execution_list = []
for function_call in decode_output:
for key, value in function_call.items():
execution_list.append(f"{key}({','.join([f'{k}={repr(v)}' for k, v in value.items()])})")
return execution_list
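# Illustrative only (not part of this module): what the decoders above produce for a
# typical single-call string. The function name and arguments are invented.
def _example_decode() -> None:
    call = '[get_weather(city="Paris", unit="celsius")]'
    assert decode_ast(call, language="Python") == [{"get_weather": {"city": "Paris", "unit": "celsius"}}]
    assert decode_execute(call) == ["get_weather(city='Paris',unit='celsius')"]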

View file

@ -0,0 +1,989 @@
# ruff: noqa
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import json
import re
import time
from typing import Any
# Comment out for now until we actually use the rest checker in evals
# import requests # Do not remove this import even though it seems to be unused. It's used in the executable_checker_rest function.
class NoAPIKeyError(Exception):
def __init__(self):
self.message = "Please fill in the API keys in the function_credential_config.json file. If you do not provide the API keys, the executable test category results will be inaccurate."
super().__init__(self.message)
REAL_TIME_MATCH_ALLOWED_DIFFERENCE = 0.2
JAVA_TYPE_CONVERSION = {
"byte": int,
"short": int,
"integer": int,
"float": float,
"double": float,
"long": int,
"boolean": bool,
"char": str,
"Array": list,
"ArrayList": list,
"Set": set,
"HashMap": dict,
"Hashtable": dict,
"Queue": list, # this can be `queue.Queue` as well, for simplicity we check with list
"Stack": list,
"String": str,
"any": str,
}
JS_TYPE_CONVERSION = {
"String": str,
"integer": int,
"float": float,
"Bigint": int,
"Boolean": bool,
"dict": dict,
"array": list,
"any": str,
}
# We switch to conditional import for the following two imports to avoid unnecessary installations.
# Users don't need to set up the tree-sitter packages if they are not running the test for that language.
# from js_type_converter import js_type_converter
# from java_type_converter import java_type_converter
PYTHON_TYPE_MAPPING = {
"string": str,
"integer": int,
"float": float,
"boolean": bool,
"array": list,
"tuple": list,
"dict": dict,
"any": str,
}
# This is the list of types that we need to recursively check its values
PYTHON_NESTED_TYPE_CHECK_LIST = ["array", "tuple"]
NESTED_CONVERSION_TYPE_LIST = ["Array", "ArrayList", "array"]
#### Helper functions for AST ####
def find_description(func_descriptions, name):
if type(func_descriptions) == list:
for func_description in func_descriptions:
if func_description["name"] == name:
return func_description
return None
else:
# it is a dict, there is only one function
return func_descriptions
def get_possible_answer_type(possible_answer: list):
for answer in possible_answer:
if answer != "": # Optional parameter
return type(answer)
return None
def type_checker(
param: str,
value,
possible_answer: list,
expected_type_description: str,
expected_type_converted,
nested_type_converted,
):
# NOTE: This type checker only supports nested type checking for one level deep.
# We didn't implement recursive type checking for nested types, as it's not needed for the current use case and it's very complex.
result: Any = {
"valid": True,
"error": [],
"is_variable": False,
"error_type": "type_error:simple",
}
is_variable = False
# check for the case where a variable is used instead of an actual value.
# use the type in possible_answer as the expected type
possible_answer_type = get_possible_answer_type(possible_answer)
# if possible_answer only contains optional parameters, we can't determine the type
if possible_answer_type != None:
# we are being precise here.
# in fact, possible_answer_type should always be string, as that's how we treat variables in possible_answer
if possible_answer_type != expected_type_converted:
is_variable = True
# value is the same type as in function description
if type(value) == expected_type_converted:
# We don't need to do recursive check for simple types
if nested_type_converted == None:
result["is_variable"] = is_variable
return result
else:
for possible_answer_item in possible_answer:
flag = True # Each parameter should match to at least one possible answer type.
# Here, we assume that each item should be the same type. We could also relax it.
if type(possible_answer_item) == list:
for value_item in value:
checker_result = type_checker(
param,
value_item,
possible_answer_item,
str(nested_type_converted),
nested_type_converted,
None,
)
if not checker_result["valid"]:
flag = False
break
if flag:
return {"valid": True, "error": [], "is_variable": is_variable}
result["valid"] = False
result["error"] = [
f"Nested type checking failed for parameter {repr(param)}. Expected outer type {expected_type_description} with inner type {str(nested_type_converted)}. Parameter value: {repr(value)}."
]
result["error_type"] = "type_error:nested"
# value is not as expected, check for the case where a variable is used instead of an actual value
# use the type in possible_answer as the expected type
possible_answer_type = get_possible_answer_type(possible_answer)
# if possible_answer only contains optional parameters, we can't determine the type
if possible_answer_type != None:
# we are being precise here.
# in fact, possible_answer_type should always be string, as that's how we treat variables in possible_answer
if type(value) == possible_answer_type:
result["is_variable"] = True
return result
result["valid"] = False
result["error"].append(
f"Incorrect type for parameter {repr(param)}. Expected type {expected_type_description}, got {type(value).__name__}. Parameter value: {repr(value)}."
)
result["error_type"] = "type_error:simple"
return result
def standardize_string(input_string: str):
# This function standardizes the string by removing all the spaces, ",./-_*^" punctuation, and converting it to lowercase
# It will also convert all the single quotes to double quotes
# This is used to compare the model output with the possible answers
# We don't want to punish the model for answers like April 1, 2024 vs April 1,2024 vs April 1 2024
regex_string = r"[ \,\.\/\-\_\*\^]"
return re.sub(regex_string, "", input_string).lower().replace("'", '"')
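# For example (illustrative): standardize_string("April 1, 2024"), standardize_string("April 1,2024")
# and standardize_string("april 1 2024") all reduce to "april12024", so those spellings compare equal.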
def string_checker(param: str, model_output: str, possible_answer: list):
standardize_possible_answer = []
standardize_model_output = standardize_string(model_output)
for i in range(len(possible_answer)):
if type(possible_answer[i]) == str:
standardize_possible_answer.append(standardize_string(possible_answer[i]))
if standardize_model_output not in standardize_possible_answer:
return {
"valid": False,
"error": [
f"Invalid value for parameter {repr(param)}: {repr(model_output)}. Expected one of {possible_answer}. Case insensitive."
],
"error_type": "value_error:string",
}
return {"valid": True, "error": []}
def list_checker(param: str, model_output: list, possible_answer: list):
# Convert the tuple to a list
standardize_model_output = list(model_output)
# If the element in the list is a string, we need to standardize it
for i in range(len(standardize_model_output)):
if type(standardize_model_output[i]) == str:
standardize_model_output[i] = standardize_string(model_output[i])
standardize_possible_answer: Any = []
# We also need to standardize the possible answers
for i in range(len(possible_answer)):
standardize_possible_answer.append([])
for j in range(len(possible_answer[i])):
if type(possible_answer[i][j]) == str:
standardize_possible_answer[i].append(standardize_string(possible_answer[i][j]))
else:
standardize_possible_answer[i].append(possible_answer[i][j])
if standardize_model_output not in standardize_possible_answer:
return {
"valid": False,
"error": [
f"Invalid value for parameter {repr(param)}: {repr(model_output)}. Expected one of {possible_answer}."
],
"error_type": "value_error:list/tuple",
}
return {"valid": True, "error": []}
def dict_checker(param: str, model_output: dict, possible_answers: list):
# This function works for simple dictionaries, but not dictionaries with nested dictionaries.
# The current dataset only contains simple dictionaries, so this is sufficient.
result = {"valid": False, "error": [], "error_type": "dict_checker:unclear"}
for i in range(len(possible_answers)):
if possible_answers[i] == "":
continue
result = {"valid": False, "error": [], "error_type": "dict_checker:unclear"}
flag = True
possible_answer = possible_answers[i]
# possible_answer is a single dictionary
for key, value in model_output.items():
if key not in possible_answer:
result["valid"] = False
result["error"].append(f"Unexpected dict key parameter: '{key}'.") # type: ignore[attr-defined]
result["error_type"] = "value_error:dict_key"
flag = False
break
standardize_value = value
# If the value is a string, we need to standardize it
if type(value) == str:
standardize_value = standardize_string(value)
# We also need to standardize the possible answers if they are string
standardize_possible_answer = []
for i in range(len(possible_answer[key])):
if type(possible_answer[key][i]) == str:
standardize_possible_answer.append(standardize_string(possible_answer[key][i]))
else:
standardize_possible_answer.append(possible_answer[key][i])
if standardize_value not in standardize_possible_answer:
result["valid"] = False
result["error"].append( # type: ignore[attr-defined]
f"Invalid value for parameter {repr(key)}: {repr(value)}. Expected one of {standardize_possible_answer}."
)
result["error_type"] = "value_error:dict_value"
flag = False
break
for key, value in possible_answer.items():
if key not in model_output and "" not in value:
result["valid"] = False
result["error"].append(f"Missing dict key parameter: '{key}'.") # type: ignore[attr-defined]
result["error_type"] = "value_error:dict_key"
flag = False
break
if flag:
return {"valid": True, "error": []}
return result
def list_dict_checker(param: str, model_output: list, possible_answers: list):
# This function takes in a list of dictionaries and checks if each dictionary is valid
# The order of the dictionaries in the list must match the order of the possible answers
result = {"valid": False, "error": [], "error_type": "list_dict_checker:unclear"}
for answer_index in range(len(possible_answers)):
flag = True # True means so far, all dictionaries are valid
# Only proceed if the number of dictionaries in the list matches the number of dictionaries in the possible answers
if len(model_output) != len(possible_answers[answer_index]):
result["valid"] = False
result["error"] = ["Wrong number of dictionaries in the list."]
result["error_type"] = "value_error:list_dict_count"
flag = False
continue
for dict_index in range(len(model_output)):
result = dict_checker(
param,
model_output[dict_index],
[possible_answers[answer_index][dict_index]],
)
if not result["valid"]:
flag = False
break
if flag:
return {"valid": True, "error": []}
return result
def simple_function_checker(
func_description: dict,
model_output: dict,
possible_answer: dict,
language: str,
model_name: str,
):
possible_answer = list(possible_answer.values())[0]
# Extract function name and parameters details
func_name = func_description["name"]
param_details = func_description["parameters"]["properties"]
required_params = func_description["parameters"]["required"]
# Initialize a result dictionary
result = {
"valid": True,
"error": [],
"error_type": "simple_function_checker:unclear",
}
# Check if function name matches
if func_name not in model_output:
result["valid"] = False
result["error"].append( # type: ignore[attr-defined]
f"Function name {repr(func_name)} not found in model output."
)
result["error_type"] = "simple_function_checker:wrong_func_name"
return result
model_params = model_output[func_name]
# Check for required parameters in model output
for param in required_params:
if param not in model_params:
result["valid"] = False
result["error"].append(f"Missing required parameter: {repr(param)}.") # type: ignore[attr-defined]
result["error_type"] = "simple_function_checker:missing_required"
return result
# Validate types and values for each parameter in model output
for param, value in model_params.items():
if param not in param_details or param not in possible_answer:
result["valid"] = False
result["error"].append(f"Unexpected parameter: {repr(param)}.") # type: ignore[attr-defined]
result["error_type"] = "simple_function_checker:unexpected_param"
return result
full_param_details = param_details[param]
expected_type_description = full_param_details["type"] # This is a string
is_variable = False
nested_type_converted = None
if language == "Java":
from evals.utils.bfcl.java_type_converter import java_type_converter
expected_type_converted = JAVA_TYPE_CONVERSION[expected_type_description]
if expected_type_description in JAVA_TYPE_CONVERSION:
if type(value) != str:
result["valid"] = False
result["error"].append( # type: ignore[attr-defined]
f"Incorrect type for parameter {repr(param)}. Expected type String, got {type(value).__name__}. Parameter value: {repr(value)}."
)
result["error_type"] = "type_error:java"
return result
if expected_type_description in NESTED_CONVERSION_TYPE_LIST:
nested_type = param_details[param]["items"]["type"]
nested_type_converted = JAVA_TYPE_CONVERSION[nested_type]
value = java_type_converter(value, expected_type_description, nested_type)
else:
value = java_type_converter(value, expected_type_description)
elif language == "JavaScript":
from evals.utils.bfcl.js_type_converter import js_type_converter
expected_type_converted = JS_TYPE_CONVERSION[expected_type_description]
if expected_type_description in JS_TYPE_CONVERSION:
if type(value) != str:
result["valid"] = False
result["error"].append( # type: ignore[attr-defined]
f"Incorrect type for parameter {repr(param)}. Expected type String, got {type(value).__name__}. Parameter value: {repr(value)}."
)
result["error_type"] = "type_error:js"
return result
if expected_type_description in NESTED_CONVERSION_TYPE_LIST:
nested_type = param_details[param]["items"]["type"]
nested_type_converted = JS_TYPE_CONVERSION[nested_type]
value = js_type_converter(value, expected_type_description, nested_type)
else:
value = js_type_converter(value, expected_type_description)
elif language == "Python":
expected_type_converted = PYTHON_TYPE_MAPPING[expected_type_description]
if expected_type_description in PYTHON_NESTED_TYPE_CHECK_LIST:
nested_type = param_details[param]["items"]["type"]
nested_type_converted = PYTHON_TYPE_MAPPING[nested_type]
# We convert all tuple value to list when the expected type is tuple.
# The conversion is necessary because any tuple in the possible answer would become a list after being processed through json.dump() and json.load().
# This does introduce some false positives (e.g., when the model provides a list value instead of a tuple). We hope to find a better solution in the future.
if expected_type_description == "tuple" and type(value) == tuple:
value = list(value)
# Allow python auto conversion from int to float
if language == "Python" and expected_type_description == "float" and type(value) == int:
value = float(value)
# Type checking
# In fact, we only check for Python here.
# Type check for other languages are handled by the type converter, and so their value (after conversion) is always correct.
type_check_result = type_checker(
param,
value,
possible_answer[param],
expected_type_description,
expected_type_converted,
nested_type_converted,
)
is_variable = type_check_result["is_variable"]
if not type_check_result["valid"]:
return type_check_result
# It doesn't make sense to special handle dictionaries and list of dictionaries if the value is a variable.
# We can just treat the variable as a string and use the normal flow.
if not is_variable:
# Special handle for dictionaries
if expected_type_converted == dict:
result = dict_checker(param, value, possible_answer[param])
if not result["valid"]:
return result
continue
# Special handle for list of dictionaries
elif expected_type_converted == list and nested_type_converted == dict:
result = list_dict_checker(param, value, possible_answer[param])
if not result["valid"]:
return result
continue
# Special handle for strings
elif expected_type_converted == str:
# We don't check for case sensitivity for string, as long as it's not a variable
result = string_checker(param, value, possible_answer[param])
if not result["valid"]:
return result
continue
elif expected_type_converted == list:
result = list_checker(param, value, possible_answer[param])
if not result["valid"]:
return result
continue
# Check if the value is within the possible answers
if value not in possible_answer[param]:
result["valid"] = False
result["error"].append( # type: ignore[attr-defined]
f"Invalid value for parameter {repr(param)}: {repr(value)}. Expected one of {possible_answer[param]}."
)
result["error_type"] = "value_error:others"
return result
# Check for optional parameters not provided but allowed
for param in possible_answer:
if param not in model_params and "" not in possible_answer[param]:
result["valid"] = False
result["error"].append( # type: ignore[attr-defined]
f"Optional parameter {repr(param)} not provided and not marked as optional."
)
result["error_type"] = "simple_function_checker:missing_optional"
return result
return result
def parallel_function_checker_enforce_order(
func_descriptions: list,
model_output: list,
possible_answers: dict,
language: str,
model_name: str,
):
if len(model_output) != len(possible_answers):
return {
"valid": False,
"error": ["Wrong number of functions."],
"error_type": "parallel_function_checker_enforce_order:wrong_count",
}
func_name_list = list(possible_answers.keys())
possible_answers_list = []
for key, value in possible_answers.items():
possible_answers_list.append({key: value})
for i in range(len(possible_answers_list)):
func_description = find_description(func_descriptions, func_name_list[i])
result = simple_function_checker(
func_description,
model_output[i],
possible_answers_list[i],
language,
model_name,
)
if not result["valid"]:
return result
return {"valid": True, "error": []}
def parallel_function_checker_no_order(
func_descriptions: list,
model_output: list,
possible_answers: list,
language: str,
model_name: str,
):
if len(model_output) != len(possible_answers):
return {
"valid": False,
"error": ["Wrong number of functions."],
"error_type": "parallel_function_checker_no_order:wrong_count",
}
matched_indices = []
# We go through the possible answers one by one, and eliminate the model output that matches the possible answer
# It must be this way because we need ground truth to fetch the correct function description
for i in range(len(possible_answers)):
# possible_answers[i] is a dictionary with only one key
func_name_expected = list(possible_answers[i].keys())[0]
func_description = find_description(func_descriptions, func_name_expected)
all_errors = []
for index in range(len(model_output)):
if index in matched_indices:
continue
result = simple_function_checker(
func_description,
model_output[index],
possible_answers[i],
language,
model_name,
)
if result["valid"]:
matched_indices.append(index)
break
else:
all_errors.append(
{
f"Model Result Index {index}": {
"sub_error": result["error"],
"sub_error_type": result["error_type"],
"model_output_item": model_output[index],
"possible_answer_item": possible_answers[i],
}
}
)
if not result["valid"]:
considered_indices = [i for i in range(len(model_output)) if i not in matched_indices]
all_errors.insert(
0,
f"Could not find a matching function among index {considered_indices} of model output for index {i} of possible answers.", # type: ignore[arg-type]
)
return {
"valid": False,
"error": all_errors,
"error_type": "parallel_function_checker_no_order:cannot_find_match",
}
return {"valid": True, "error": []}
def multiple_function_checker(
func_descriptions: list,
model_output: list,
possible_answers: list,
language: str,
model_name: str,
):
if len(model_output) != len(possible_answers):
return {
"valid": False,
"error": ["Wrong number of functions."],
"error_type": "multiple_function_checker:wrong_count",
}
# possible_answers is a list of only one dictionary with only one key
func_name_expected = list(possible_answers[0].keys())[0]
func_description = find_description(func_descriptions, func_name_expected)
return simple_function_checker(
func_description,
model_output[0],
possible_answers[0],
language,
model_name,
)
def patten_matcher(exec_output, expected_result, function_call, is_sanity_check):
result = {"valid": True, "error": [], "error_type": "executable_checker:unclear"}
if type(exec_output) != type(expected_result):
return {
"valid": False,
"error": [
f"Wrong execution result type for {repr(function_call)}. Expected type: {type(expected_result)}, but got: {type(exec_output)}."
],
"error_type": "executable_checker:wrong_result_type",
"model_executed_output": exec_output,
}
if type(exec_output) == dict:
# We loosen the requirement for the sanity check, as the expected result used in the sanity check might not be the most up-to-date one.
# This happens when the key is a timestamp or a random number.
if is_sanity_check:
if len(exec_output) != len(expected_result):
return {
"valid": False,
"error": [
f"Wrong execution result pattern for {repr(function_call)}. Expect type Dict, but wrong number of elements in the output. Expected length: {len(expected_result)}, but got: {len(exec_output)}."
],
"error_type": "executable_checker:wrong_result_type:dict_length",
"model_executed_output": exec_output,
}
else:
return result
for key, value in expected_result.items():
if key not in exec_output:
return {
"valid": False,
"error": [
f"Wrong execution result pattern for {repr(function_call)}. Expect type Dict, but key {repr(key)} not found in the model output."
],
"error_type": "executable_checker:wrong_result_type:dict_key_not_found",
"model_executed_output": exec_output,
}
for key, value in exec_output.items():
if key not in expected_result:
return {
"valid": False,
"error": [
f"Wrong execution result pattern for {repr(function_call)}. Expect type Dict, but key {repr(key)} not expected in the model output."
],
"error_type": "executable_checker:wrong_result_type:dict_extra_key",
"model_executed_output": exec_output,
}
if type(exec_output) == list:
if len(exec_output) != len(expected_result):
return {
"valid": False,
"error": [
f"Wrong execution result pattern for {repr(function_call)}. Expect type list, but wrong number of elements in the output. Expected length: {len(expected_result)}, but got: {len(exec_output)}."
],
"error_type": "executable_checker:wrong_result_type:list_length",
"model_executed_output": exec_output,
}
return result
#### Helper functions for Exec ####
def executable_checker_simple(
function_call: str,
expected_result,
expected_result_type: str,
is_sanity_check=False,
):
result = {"valid": True, "error": [], "error_type": "executable_checker:unclear"}
exec_dict: Any = {}
try:
exec(
"from executable_python_function import *" + "\nresult=" + function_call,
exec_dict,
)
exec_output = exec_dict["result"]
except NoAPIKeyError as e:
raise e
except Exception as e:
result["valid"] = False
result["error"].append( # type: ignore[attr-defined]
f"Error in execution: {repr(function_call)}. Error: {str(e)}"
)
result["error_type"] = "executable_checker:execution_error"
return result
# We need to special handle the case where the execution result is a tuple and convert it to a list
# Because when json is stored, the tuple is converted to a list, and so the expected result is a list when loaded from json
if isinstance(exec_output, tuple):
exec_output = list(exec_output)
if expected_result_type == "exact_match":
if exec_output != expected_result:
result["valid"] = False
result["error"].append( # type: ignore[attr-defined]
f"Wrong execution result for {repr(function_call)}. Expected: {expected_result}, but got: {exec_output}."
)
result["error_type"] = "executable_checker:wrong_result"
result["model_executed_output"] = exec_output
return result
elif expected_result_type == "real_time_match":
# Allow for a 20% relative difference (REAL_TIME_MATCH_ALLOWED_DIFFERENCE)
if (type(expected_result) == float or type(expected_result) == int) and (
type(exec_output) == float or type(exec_output) == int
):
if not (
expected_result * (1 - REAL_TIME_MATCH_ALLOWED_DIFFERENCE)
<= exec_output
<= expected_result * (1 + REAL_TIME_MATCH_ALLOWED_DIFFERENCE)
):
result["valid"] = False
result["error"].append( # type: ignore[attr-defined]
f"Wrong execution result for {repr(function_call)}. Expected: {expected_result}, but got: {exec_output}. {REAL_TIME_MATCH_ALLOWED_DIFFERENCE * 100}% difference allowed."
)
result["error_type"] = "executable_checker:wrong_result_real_time"
result["model_executed_output"] = exec_output
return result
else:
result["valid"] = False
result["error"].append( # type: ignore[attr-defined]
f"Wrong execution result for {repr(function_call)}. Expected: {expected_result}, but got: {exec_output}. Type needs to be float or int for real time match criteria."
)
result["error_type"] = "executable_checker:wrong_result_real_time"
result["model_executed_output"] = exec_output
return result
else:
# structural match
pattern_match_result = patten_matcher(exec_output, expected_result, function_call, is_sanity_check)
if not pattern_match_result["valid"]:
return pattern_match_result
return result
def executable_checker_parallel_no_order(
decoded_result: list, expected_exec_result: list, expected_exec_result_type: list
):
if len(decoded_result) != len(expected_exec_result):
return {
"valid": False,
"error": [
f"Wrong number of functions provided. Expected {len(expected_exec_result)}, but got {len(decoded_result)}."
],
"error_type": "value_error:exec_result_count",
}
matched_indices = []
for i in range(len(expected_exec_result)):
all_errors = []
for index in range(len(decoded_result)):
if index in matched_indices:
continue
result = executable_checker_simple(
decoded_result[index],
expected_exec_result[i],
expected_exec_result_type[i],
False,
)
if result["valid"]:
matched_indices.append(index)
break
else:
all_errors.append(
{
f"Model Result Index {index}": {
"sub_error": result["error"],
"sub_error_type": result["error_type"],
"model_executed_output": (
result["model_executed_output"] if "model_executed_output" in result else None
),
}
}
)
if not result["valid"]:
considered_indices = [i for i in range(len(decoded_result)) if i not in matched_indices]
all_errors.insert(
0,
f"Could not find a matching function among index {considered_indices} of model output for index {i} of possible answers.", # type: ignore[arg-type]
)
return {
"valid": False,
"error": all_errors,
"error_type": "executable_checker:cannot_find_match",
}
return {"valid": True, "error": [], "error_type": "executable_checker:unclear"}
#### Main function ####
def executable_checker_rest(func_call, idx):
# Move this here for now to avoid needing to read this file / fix paths to be relative to dataset_dir. Fix when it's actually needed / used.
EVAL_GROUND_TRUTH_PATH = "/mnt/wsfuse/fair_llm_v2/datasets/eval/bfcl/rest-eval-response_v5.jsonl" # Ground truth file for v5 for rest execution
with open(EVAL_GROUND_TRUTH_PATH, "r") as f:
EVAL_GROUND_TRUTH = f.readlines()
if "https://geocode.maps.co" in func_call:
time.sleep(2)
if "requests_get" in func_call:
func_call = func_call.replace("requests_get", "requests.get")
try:
response = eval(func_call)
except Exception as e:
return {
"valid": False,
"error": [f"Execution failed. {str(e)}"],
"error_type": "executable_checker_rest:execution_error",
}
try:
if response.status_code == 200:
eval_GT_json = json.loads(EVAL_GROUND_TRUTH[idx])
try:
if isinstance(eval_GT_json, dict):
if isinstance(response.json(), dict):
if set(eval_GT_json.keys()) == set(response.json().keys()):
return {"valid": True, "error": [], "error_type": ""}
return {
"valid": False,
"error": ["Key inconsistency"],
"error_type": "executable_checker_rest:wrong_key",
}
return {
"valid": False,
"error": [f"Expected dictionary, but got {type(response.json())}"],
"error_type": "executable_checker_rest:wrong_type",
}
elif isinstance(eval_GT_json, list):
if isinstance(response.json(), list):
if len(eval_GT_json) != len(response.json()):
return {
"valid": False,
"error": [f"Response list length inconsistency."],
"error_type": "value_error:exec_result_rest_count",
}
else:
for i in range(len(eval_GT_json)):
if set(eval_GT_json[i].keys()) != set(response.json()[i].keys()):
return {
"valid": False,
"error": [f"Key inconsistency"],
"error_type": "executable_checker_rest:wrong_key",
}
return {"valid": True, "error": []}
else:
return {
"valid": False,
"error": [f"Expected list, but got {type(response.json())}"],
"error_type": "executable_checker_rest:wrong_type",
}
return {
"valid": False,
"error": [f"Expected dict or list, but got {type(response.json())}"],
"error_type": "executable_checker_rest:wrong_type",
}
except Exception as e:
return {
"valid": False,
"error": [
f"Error in execution and type checking. Status code: {response.status_code}. Error: {str(e)}"
],
"error_type": "executable_checker_rest:response_format_error",
}
else:
return {
"valid": False,
"error": [f"Execution result status code is not 200, got {response.status_code}"],
"error_type": "executable_checker_rest:wrong_status_code",
}
except Exception as e:
return {
"valid": False,
"error": [f"Cannot get status code of the response. Error: {str(e)}"],
"error_type": "executable_checker_rest:cannot_get_status_code",
}
def ast_checker(func_description, model_output, possible_answer, language, test_category, model_name):
if "parallel" in test_category:
return parallel_function_checker_no_order(func_description, model_output, possible_answer, language, model_name)
elif "multiple" in test_category:
return multiple_function_checker(func_description, model_output, possible_answer, language, model_name)
else:
if len(model_output) != 1:
return {
"valid": False,
"error": ["Wrong number of functions."],
"error_type": "simple_function_checker:wrong_count",
}
return simple_function_checker(
func_description[0],
model_output[0],
possible_answer[0],
language,
model_name,
)
def exec_checker(decoded_result: list, func_description: dict, test_category: str):
if "multiple" in test_category or "parallel" in test_category:
return executable_checker_parallel_no_order(
decoded_result,
func_description["execution_result"],
func_description["execution_result_type"],
)
else:
if len(decoded_result) != 1:
return {
"valid": False,
"error": ["Wrong number of functions."],
"error_type": "simple_exec_checker:wrong_count",
}
return executable_checker_simple(
decoded_result[0],
func_description["execution_result"][0],
func_description["execution_result_type"][0],
False,
)
def is_empty_output(decoded_output):
# This function is a patch to the ast decoder for relevance detection
# Sometimes the ast decoder will parse successfully, but the input doesn't really have a function call
# [], [{}], and anything that is not in function calling format is considered empty (and thus should be marked as correct)
if not is_function_calling_format_output(decoded_output):
return True
if len(decoded_output) == 0:
return True
if len(decoded_output) == 1 and len(decoded_output[0]) == 0:
return True
return False
def is_function_calling_format_output(decoded_output):
# Ensure the output is a list of dictionaries
if type(decoded_output) == list:
for item in decoded_output:
if type(item) != dict:
return False
return True
return False
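# A rough end-to-end sketch (illustrative, not part of this module) of how the AST checkers
# are driven for the "simple" test category. Every value below is invented.
def _example_ast_checker() -> None:
    func_description = [
        {
            "name": "get_weather",
            "parameters": {
                "type": "dict",
                "properties": {"city": {"type": "string"}, "unit": {"type": "string"}},
                "required": ["city"],
            },
        }
    ]
    model_output = [{"get_weather": {"city": "Paris", "unit": "celsius"}}]
    possible_answer = [{"get_weather": {"city": ["Paris"], "unit": ["celsius", ""]}}]
    result = ast_checker(func_description, model_output, possible_answer, "Python", "simple", "example-model")
    assert result["valid"], result["error"]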

View file

@ -0,0 +1,40 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
"""
Tree-sitter changes its API with unfortunate frequency. Modules that need it should
import it from here so that we can centrally manage things as necessary.
"""
# These currently work with tree-sitter 0.23.0
# NOTE: Don't import tree-sitter or any of the language modules in the main module
# because not all environments have them. Import lazily inside functions where needed.
import importlib
import typing
if typing.TYPE_CHECKING:
import tree_sitter
def get_language(language: str) -> "tree_sitter.Language":
import tree_sitter
language_module_name = f"tree_sitter_{language}"
try:
language_module = importlib.import_module(language_module_name)
except ModuleNotFoundError as exc:
raise ValueError(
f"Language {language} is not found. Please install the tree-sitter-{language} package."
) from exc
return tree_sitter.Language(language_module.language())
def get_parser(language: str, **kwargs) -> "tree_sitter.Parser":
import tree_sitter
lang = get_language(language)
return tree_sitter.Parser(lang, **kwargs)
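# A small usage sketch (illustrative, not part of this module); it assumes the optional
# tree-sitter and tree-sitter-java packages are installed, which is why nothing here runs at import time.
def _example_parse_java() -> None:
    parser = get_parser("java")
    tree = parser.parse(bytes('foo(bar="baz");', "utf8"))
    assert not tree.root_node.has_error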

File diff suppressed because it is too large

View file

@ -0,0 +1,330 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import re
from typing import Sequence
from llama_stack.providers.utils.scoring.basic_scoring_utils import time_limit
# from minerva
SUBSTITUTIONS = [
("an ", ""),
("a ", ""),
(".$", "$"),
("\\$", ""),
(r"\ ", ""),
(" ", ""),
("mbox", "text"),
(",\\text{and}", ","),
("\\text{and}", ","),
("\\text{m}", "\\text{}"),
]
REMOVED_EXPRESSIONS = [
"square",
"ways",
"integers",
"dollars",
"mph",
"inches",
"ft",
"hours",
"km",
"units",
"\\ldots",
"sue",
"points",
"feet",
"minutes",
"digits",
"cents",
"degrees",
"cm",
"gm",
"pounds",
"meters",
"meals",
"edges",
"students",
"childrentickets",
"multiples",
"\\text{s}",
"\\text{.}",
"\\text{\ns}",
"\\text{}^2",
"\\text{}^3",
"\\text{\n}",
"\\text{}",
r"\mathrm{th}",
r"^\circ",
r"^{\circ}",
r"\;",
r",\!",
"{,}",
'"',
"\\dots",
]
def try_evaluate_frac(expression: str, fmt: str = "0.2e") -> str:
if isinstance(expression, float):
return expression
new_expression = f"{expression}"
regex = re.compile(r"\\frac{([^}]+)}{([^}]+)}")
for match in re.finditer(regex, expression):
try:
value = float(match.group(1)) / float(match.group(2))
new_expression = new_expression.replace(
match.group(),
f"{{value:{fmt}}}".format(value=value),
1,
)
except Exception:
continue
return new_expression
def try_evaluate_latex(expression: str, fmt: str = ".2e") -> str:
try:
with time_limit(seconds=5):
from sympy.parsing.latex import parse_latex
value = parse_latex(expression).evalf() # type: ignore
return f"{{value:{fmt}}}".format(value=value)
except Exception:
return expression
def first_answer(text: str, markers: Sequence[str] = ("Q:", "A:")) -> str:
for marker in markers:
text = text.split(marker)[0]
return text
def extract_result_from_boxed(answer: str) -> str:
box_start = "\\boxed"
# format is `\\boxed <value>$` or `\\boxed{<value>}`, with potential white spaces framing `<value>`
start = answer.rfind(box_start)
if start < 0:
return ""
answer = answer[start + len(box_start) :].strip()
ends_with_curly = answer.startswith("{")
i = 0
open_braces = 0
while i < len(answer):
if answer[i] == "{":
open_braces += 1
elif answer[i] == "}":
open_braces -= 1
if open_braces == 0:
if ends_with_curly:
answer = answer[: i + 1].strip()
break
elif answer[i] == "$":
answer = answer[:i].strip()
break
i += 1
else:
return ""
# remove extra curly braces
while True:
if answer.startswith("{") and answer.endswith("}"):
answer = answer[1:-1].strip()
else:
break
return answer
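# For example (illustrative): extract_result_from_boxed(r"... so the answer is $\boxed{\frac{7}{2}}$")
# returns "\frac{7}{2}", while a string with no \boxed marker returns "".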
# from minerva paper + _normalise_result from xavierm
def normalize_final_answer(final_answer: str, regex_pattern: str, match_first: bool = True) -> str:
"""Extract and normalize a final answer to a quantitative reasoning question."""
match = re.findall(regex_pattern, final_answer)
extraction: str
if len(match) > 0:
if match_first:
extraction = match[0]
else:
extraction = match[-1]
else:
extraction = extract_result_from_boxed(final_answer)
if len(extraction) == 0:
return final_answer
else:
final_answer = extraction
final_answer = final_answer.split("=")[-1]
for before, after in SUBSTITUTIONS:
final_answer = final_answer.replace(before, after)
for expr in REMOVED_EXPRESSIONS:
final_answer = final_answer.replace(expr, "")
# Extract answer that is in LaTeX math, is bold,
# is surrounded by a box, etc.
final_answer = re.sub(r"(.*?)(\$)(.*?)(\$)(.*)", "$\\3$", final_answer)
final_answer = re.sub(r"(\\text\{)(.*?)(\})", "\\2", final_answer)
final_answer = re.sub(r"(\\textbf\{)(.*?)(\})", "\\2", final_answer)
final_answer = re.sub(r"(\\overline\{)(.*?)(\})", "\\2", final_answer)
final_answer = re.sub(r"(\\boxed\{)(.*)(\})", "\\2", final_answer)
# Normalize shorthand TeX:
# \fracab -> \frac{a}{b}
# \frac{abc}{bef} -> \frac{abc}{bef}
# \fracabc -> \frac{a}{b}c
# \sqrta -> \sqrt{a}
# \sqrtab -> sqrt{a}b
final_answer = re.sub(r"(frac)([^{])(.)", "frac{\\2}{\\3}", final_answer)
final_answer = re.sub(r"(sqrt)([^{])", "sqrt{\\2}", final_answer)
final_answer = final_answer.replace("$", "")
# Normalize 100,000 -> 100000
if final_answer.replace(",", "").isdigit():
final_answer = final_answer.replace(",", "")
# If the final answer is a single letter in parentheses, remove the parentheses
# Example: (a) -> a (but not (ab) -> ab)
if re.match(r"\([a-zA-Z]\)", final_answer):
final_answer = final_answer[1]
return _normalise_result(final_answer)
def _normalise_result(string: str) -> str:
# linebreaks
string = string.replace("\n", "")
# remove inverse spaces
string = string.replace("\\!", "")
# replace \\ with \
string = string.replace("\\\\", "\\")
# replace tfrac and dfrac with frac
string = string.replace("cfrac", "frac")
string = string.replace("tfrac", "frac")
string = string.replace("dfrac", "frac")
# remove \left and \right
string = string.replace("\\left", "")
string = string.replace("\\le", "")
string = string.replace("\\right", "")
# Remove circ (degrees)
string = string.replace("^{\\circ}", "")
string = string.replace("^\\circ", "")
# remove dollar signs
string = string.replace("\\$", "")
# remove units (on the right)
string = _remove_right_units(string)
# remove percentage
string = string.replace("\\%", "")
string = string.replace(r"\%", "")
# " 0." equivalent to " ." and "{0." equivalent to "{." Alternatively, add "0" if "." is the start of the string
string = string.replace(" .", " 0.")
string = string.replace("{.", "{0.")
# if empty, return empty string
if len(string) == 0:
return string
if string[0] == ".":
string = "0" + string
# to consider: get rid of e.g. "k = " or "q = " at beginning
string = string.split("=")[-1]
# fix sqrt3 --> sqrt{3}
string = _fix_sqrt(string)
# remove spaces
string = string.replace(" ", "")
# \frac1b or \frac12 --> \frac{1}{b} and \frac{1}{2}, etc. Even works with \frac1{72} (but not \frac{72}1). Also does a/b --> \\frac{a}{b}
string = _fix_fracs(string)
# manually change 0.5 --> \frac{1}{2}
if string == "0.5":
string = "\\frac{1}{2}"
# NOTE: X/Y changed to \frac{X}{Y} in dataset, but in simple cases fix in case the model output is X/Y
string = _fix_a_slash_b(string)
return string
def _remove_right_units(string: str) -> str:
# "\\text{ " only ever occurs (at least in the val set) when describing units
try:
if "\\text{ " in string:
splits = string.split("\\text{ ")
assert len(splits) == 2
return splits[0]
else:
return string
except AssertionError:
return string
def _fix_sqrt(string: str) -> str:
if "\\sqrt" not in string:
return string
splits = string.split("\\sqrt")
new_string = splits[0]
for split in splits[1:]:
if len(split) == 0:
return string
if split[0] != "{":
a = split[0]
new_substr = "\\sqrt{" + a + "}" + split[1:]
else:
new_substr = "\\sqrt" + split
new_string += new_substr
return new_string
def _fix_fracs(string: str) -> str:
substrs = string.split("\\frac")
new_str = substrs[0]
if len(substrs) > 1:
substrs = substrs[1:]
for substr in substrs:
new_str += "\\frac"
if len(substr) == 0:
return string
if substr[0] == "{":
new_str += substr
else:
try:
assert len(substr) >= 2
except AssertionError:
return string
a = substr[0]
b = substr[1]
if b != "{":
if len(substr) > 2:
post_substr = substr[2:]
new_str += "{" + a + "}{" + b + "}" + post_substr
else:
new_str += "{" + a + "}{" + b + "}"
else:
if len(substr) > 2:
post_substr = substr[2:]
new_str += "{" + a + "}" + b + post_substr
else:
new_str += "{" + a + "}" + b
string = new_str
return string
def _fix_a_slash_b(string: str) -> str:
if len(string.split("/")) != 2:
return string
a = string.split("/")[0]
b = string.split("/")[1]
try:
ia = int(a)
ib = int(b)
assert string == "{}/{}".format(ia, ib)
new_string = "\\frac{" + str(ia) + "}{" + str(ib) + "}"
return new_string
except (ValueError, AssertionError):
return string
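# A sketch (illustrative, not part of this module) of how the helpers above compose when
# grading a free-form answer. The strings and the regex are invented, and the final value assumes
# sympy's LaTeX parser is available; otherwise try_evaluate_latex returns its input unchanged.
def _example_normalize() -> str:
    raw = "The probability is therefore $\\boxed{\\frac{1}{4}}$."
    ans = normalize_final_answer(first_answer(raw), r"answer is (.*)")
    # Expected to be "2.50e-01"; the plain string "0.25" run through the same pipeline
    # normalizes to the same value.
    return try_evaluate_frac(try_evaluate_latex(ans))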

View file

@ -3,11 +3,11 @@
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from typing import Dict
from typing import Any, Dict
from pydantic import BaseModel
from llama_stack.distribution.datatypes import Api, ProviderSpec
from llama_stack.distribution.datatypes import Api
from .config import BraintrustScoringConfig
@ -18,7 +18,7 @@ class BraintrustProviderDataValidator(BaseModel):
async def get_provider_impl(
config: BraintrustScoringConfig,
deps: Dict[Api, ProviderSpec],
deps: Dict[Api, Any],
):
from .braintrust import BraintrustScoringImpl

View file

@ -133,7 +133,7 @@ class BraintrustScoringImpl(
async def shutdown(self) -> None: ...
async def list_scoring_functions(self) -> List[ScoringFn]:
scoring_fn_defs_list = [x for x in self.supported_fn_defs_registry.values()]
scoring_fn_defs_list = list(self.supported_fn_defs_registry.values())
for f in scoring_fn_defs_list:
assert f.identifier.startswith("braintrust"), (
"All braintrust scoring fn must have identifier prefixed with 'braintrust'! "
@ -167,11 +167,11 @@ class BraintrustScoringImpl(
dataset_def = await self.datasets_api.get_dataset(dataset_id=dataset_id)
validate_dataset_schema(dataset_def.dataset_schema, get_valid_schemas(Api.scoring.value))
all_rows = await self.datasetio_api.get_rows_paginated(
all_rows = await self.datasetio_api.iterrows(
dataset_id=dataset_id,
rows_in_page=-1,
limit=-1,
)
res = await self.score(input_rows=all_rows.rows, scoring_functions=scoring_functions)
res = await self.score(input_rows=all_rows.data, scoring_functions=scoring_functions)
if save_results_dataset:
# TODO: persist and register dataset on to server for reading
# self.datasets_api.register_dataset()

View file

@ -3,16 +3,16 @@
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from typing import Dict
from typing import Any, Dict
from llama_stack.distribution.datatypes import Api, ProviderSpec
from llama_stack.distribution.datatypes import Api
from .config import LlmAsJudgeScoringConfig
async def get_provider_impl(
config: LlmAsJudgeScoringConfig,
deps: Dict[Api, ProviderSpec],
deps: Dict[Api, Any],
):
from .scoring import LlmAsJudgeScoringImpl

View file

@ -3,7 +3,12 @@
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from typing import Any, Dict
from pydantic import BaseModel
class LlmAsJudgeScoringConfig(BaseModel): ...
class LlmAsJudgeScoringConfig(BaseModel):
@classmethod
def sample_run_config(cls, __distro_dir__: str, **kwargs: Any) -> Dict[str, Any]:
return {}

View file

@ -25,7 +25,7 @@ from llama_stack.providers.utils.common.data_schema_validator import (
from .config import LlmAsJudgeScoringConfig
from .scoring_fn.llm_as_judge_scoring_fn import LlmAsJudgeScoringFn
LLM_JUDGE_FNS = [LlmAsJudgeScoringFn]
LLM_JUDGE_FN = LlmAsJudgeScoringFn
class LlmAsJudgeScoringImpl(
@ -43,23 +43,17 @@ class LlmAsJudgeScoringImpl(
self.datasetio_api = datasetio_api
self.datasets_api = datasets_api
self.inference_api = inference_api
self.scoring_fn_id_impls = {}
async def initialize(self) -> None:
for fn in LLM_JUDGE_FNS:
impl = fn(inference_api=self.inference_api)
for fn_defs in impl.get_supported_scoring_fn_defs():
self.scoring_fn_id_impls[fn_defs.identifier] = impl
self.llm_as_judge_fn = impl
impl = LLM_JUDGE_FN(inference_api=self.inference_api)
self.llm_as_judge_fn = impl
async def shutdown(self) -> None: ...
async def list_scoring_functions(self) -> List[ScoringFn]:
scoring_fn_defs_list = [
fn_def for impl in self.scoring_fn_id_impls.values() for fn_def in impl.get_supported_scoring_fn_defs()
]
scoring_fn_defs_list = self.llm_as_judge_fn.get_supported_scoring_fn_defs()
for f in scoring_fn_defs_list:
for f in self.llm_as_judge_fn.get_supported_scoring_fn_defs():
assert f.identifier.startswith("llm-as-judge"), (
"All llm-as-judge scoring fn must have identifier prefixed with 'llm-as-judge'! "
)
@ -67,7 +61,7 @@ class LlmAsJudgeScoringImpl(
return scoring_fn_defs_list
async def register_scoring_function(self, function_def: ScoringFn) -> None:
raise NotImplementedError("Register scoring function not implemented yet")
self.llm_as_judge_fn.register_scoring_fn_def(function_def)
async def score_batch(
self,
@ -78,12 +72,12 @@ class LlmAsJudgeScoringImpl(
dataset_def = await self.datasets_api.get_dataset(dataset_id=dataset_id)
validate_dataset_schema(dataset_def.dataset_schema, get_valid_schemas(Api.scoring.value))
all_rows = await self.datasetio_api.get_rows_paginated(
all_rows = await self.datasetio_api.iterrows(
dataset_id=dataset_id,
rows_in_page=-1,
limit=-1,
)
res = await self.score(
input_rows=all_rows.rows,
input_rows=all_rows.data,
scoring_functions=scoring_functions,
)
if save_results_dataset:
@ -102,9 +96,7 @@ class LlmAsJudgeScoringImpl(
) -> ScoreResponse:
res = {}
for scoring_fn_id in scoring_functions.keys():
if scoring_fn_id not in self.scoring_fn_id_impls:
raise ValueError(f"Scoring function {scoring_fn_id} is not supported.")
scoring_fn = self.scoring_fn_id_impls[scoring_fn_id]
scoring_fn = self.llm_as_judge_fn
scoring_fn_params = scoring_functions.get(scoring_fn_id, None)
score_results = await scoring_fn.score(input_rows, scoring_fn_id, scoring_fn_params)
agg_results = await scoring_fn.aggregate(score_results, scoring_fn_id, scoring_fn_params)

View file

@ -6,7 +6,7 @@
import re
from typing import Any, Dict, Optional
from llama_stack.apis.inference.inference import Inference
from llama_stack.apis.inference.inference import Inference, UserMessage
from llama_stack.apis.scoring import ScoringResultRow
from llama_stack.apis.scoring_functions import ScoringFnParams
from llama_stack.providers.utils.scoring.base_scoring_fn import RegisteredBaseScoringFn
@ -58,10 +58,9 @@ class LlmAsJudgeScoringFn(RegisteredBaseScoringFn):
judge_response = await self.inference_api.chat_completion(
model_id=fn_def.params.judge_model,
messages=[
{
"role": "user",
"content": judge_input_msg,
}
UserMessage(
content=judge_input_msg,
),
],
)
content = judge_response.completion_message.content

View file

@ -44,9 +44,9 @@ class TelemetryConfig(BaseModel):
return v
@classmethod
def sample_run_config(cls, __distro_dir__: str = "runtime", db_name: str = "trace_store.db") -> Dict[str, Any]:
def sample_run_config(cls, __distro_dir__: str, db_name: str = "trace_store.db") -> Dict[str, Any]:
return {
"service_name": "${env.OTEL_SERVICE_NAME:llama-stack}",
"sinks": "${env.TELEMETRY_SINKS:console,sqlite}",
"sqlite_db_path": "${env.SQLITE_DB_PATH:~/.llama/" + __distro_dir__ + "/" + db_name + "}",
"sqlite_db_path": "${env.SQLITE_DB_PATH:" + __distro_dir__ + "/" + db_name + "}",
}

View file

@ -5,7 +5,7 @@
# the root directory of this source tree.
import json
from datetime import datetime
from datetime import datetime, timezone
from opentelemetry.sdk.trace import ReadableSpan
from opentelemetry.sdk.trace.export import SpanProcessor
@ -34,7 +34,7 @@ class ConsoleSpanProcessor(SpanProcessor):
if span.attributes and span.attributes.get("__autotraced__"):
return
timestamp = datetime.utcfromtimestamp(span.start_time / 1e9).strftime("%H:%M:%S.%f")[:-3]
timestamp = datetime.fromtimestamp(span.start_time / 1e9, tz=timezone.utc).strftime("%H:%M:%S.%f")[:-3]
print(
f"{COLORS['dim']}{timestamp}{COLORS['reset']} "
@ -46,7 +46,7 @@ class ConsoleSpanProcessor(SpanProcessor):
if span.attributes and span.attributes.get("__autotraced__"):
return
timestamp = datetime.utcfromtimestamp(span.end_time / 1e9).strftime("%H:%M:%S.%f")[:-3]
timestamp = datetime.fromtimestamp(span.end_time / 1e9, tz=timezone.utc).strftime("%H:%M:%S.%f")[:-3]
span_context = (
f"{COLORS['dim']}{timestamp}{COLORS['reset']} "
@ -74,7 +74,7 @@ class ConsoleSpanProcessor(SpanProcessor):
print(f" {COLORS['dim']}{key}: {str_value}{COLORS['reset']}")
for event in span.events:
event_time = datetime.utcfromtimestamp(event.timestamp / 1e9).strftime("%H:%M:%S.%f")[:-3]
event_time = datetime.fromtimestamp(event.timestamp / 1e9, tz=timezone.utc).strftime("%H:%M:%S.%f")[:-3]
severity = event.attributes.get("severity", "info")
message = event.attributes.get("message", event.name)
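datetime.utcfromtimestamp is deprecated as of Python 3.12, so the processor switches to an explicitly timezone-aware conversion. A self-contained sketch of the equivalent formatting, using an arbitrary epoch value purely for illustration:

# Sketch only: timezone-aware formatting of an OTel nanosecond timestamp.
from datetime import datetime, timezone

span_start_ns = 1_700_000_000_000_000_000          # illustrative value, in nanoseconds
ts = datetime.fromtimestamp(span_start_ns / 1e9, tz=timezone.utc).strftime("%H:%M:%S.%f")[:-3]
print(ts)                                          # "22:13:20.000", same wall-clock result without the deprecation warning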

View file

@ -7,7 +7,8 @@
import json
import os
import sqlite3
from datetime import datetime
import threading
from datetime import datetime, timezone
from opentelemetry.sdk.trace import SpanProcessor
from opentelemetry.trace import Span
@ -17,14 +18,18 @@ class SQLiteSpanProcessor(SpanProcessor):
def __init__(self, conn_string):
"""Initialize the SQLite span processor with a connection string."""
self.conn_string = conn_string
self.conn = None
self._local = threading.local() # Thread-local storage for connections
self.setup_database()
def _get_connection(self) -> sqlite3.Connection:
"""Get the database connection."""
if self.conn is None:
self.conn = sqlite3.connect(self.conn_string, check_same_thread=False)
return self.conn
def _get_connection(self):
"""Get a thread-local database connection."""
if not hasattr(self._local, "conn"):
try:
self._local.conn = sqlite3.connect(self.conn_string)
except Exception as e:
print(f"Error connecting to SQLite database: {e}")
raise
return self._local.conn
def setup_database(self):
"""Create the necessary tables if they don't exist."""
@ -119,8 +124,8 @@ class SQLiteSpanProcessor(SpanProcessor):
trace_id,
service_name,
(span_id if not parent_span_id else None),
datetime.fromtimestamp(span.start_time / 1e9).isoformat(),
datetime.fromtimestamp(span.end_time / 1e9).isoformat(),
datetime.fromtimestamp(span.start_time / 1e9, timezone.utc).isoformat(),
datetime.fromtimestamp(span.end_time / 1e9, timezone.utc).isoformat(),
),
)
@ -138,8 +143,8 @@ class SQLiteSpanProcessor(SpanProcessor):
trace_id,
parent_span_id,
span.name,
datetime.fromtimestamp(span.start_time / 1e9).isoformat(),
datetime.fromtimestamp(span.end_time / 1e9).isoformat(),
datetime.fromtimestamp(span.start_time / 1e9, timezone.utc).isoformat(),
datetime.fromtimestamp(span.end_time / 1e9, timezone.utc).isoformat(),
json.dumps(dict(span.attributes)),
span.status.status_code.name,
span.kind.name,
@ -156,7 +161,7 @@ class SQLiteSpanProcessor(SpanProcessor):
(
span_id,
event.name,
datetime.fromtimestamp(event.timestamp / 1e9).isoformat(),
datetime.fromtimestamp(event.timestamp / 1e9, timezone.utc).isoformat(),
json.dumps(dict(event.attributes)),
),
)
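sqlite3 connections cannot safely be shared across threads by default, which is why the processor now keeps one connection per thread in threading.local(). A minimal standalone sketch of that pattern; the :memory: path is illustrative.

# Sketch only: one sqlite3 connection per thread via threading.local().
import sqlite3
import threading

class PerThreadConn:
    def __init__(self, conn_string: str):
        self.conn_string = conn_string
        self._local = threading.local()     # each thread sees its own attribute set

    def get(self) -> sqlite3.Connection:
        if not hasattr(self._local, "conn"):
            self._local.conn = sqlite3.connect(self.conn_string)
        return self._local.conn

conns = PerThreadConn(":memory:")           # illustrative connection string
conns.get().execute("SELECT 1")             # lazily opens this thread's connection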
@ -168,9 +173,14 @@ class SQLiteSpanProcessor(SpanProcessor):
def shutdown(self):
"""Cleanup any resources."""
if self.conn:
self.conn.close()
self.conn = None
# We can't access other threads' connections, so we just close our own
if hasattr(self._local, "conn"):
try:
self._local.conn.close()
except Exception as e:
print(f"Error closing SQLite connection: {e}")
finally:
del self._local.conn
def force_flush(self, timeout_millis=30000):
"""Force export of spans."""

View file

@ -73,6 +73,7 @@ class TelemetryAdapter(TelemetryDatasetMixin, Telemetry):
def __init__(self, config: TelemetryConfig, deps: Dict[str, Any]) -> None:
self.config = config
self.datasetio_api = deps.get(Api.datasetio)
self.meter = None
resource = Resource.create(
{
@ -171,6 +172,8 @@ class TelemetryAdapter(TelemetryDatasetMixin, Telemetry):
return _GLOBAL_STORAGE["gauges"][name]
def _log_metric(self, event: MetricEvent) -> None:
if self.meter is None:
return
if isinstance(event.value, int):
counter = self._get_or_create_counter(event.metric, event.unit)
counter.add(event.value, attributes=event.attributes)
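The adapter now starts with self.meter set to None and has _log_metric return early when no meter is configured, so metric events logged before that point are silently ignored. A tiny sketch of the guard pattern with a stand-in class:

# Sketch only: None-guard so metric logging is a no-op until a meter exists.
class MetricsSink:
    def __init__(self):
        self.meter = None                   # wired up later, once metrics are configured

    def log_metric(self, name: str, value: int) -> None:
        if self.meter is None:
            return                          # nothing configured yet, drop the event
        self.meter.create_counter(name).add(value)   # rough stand-in for the adapter's counter handling

MetricsSink().log_metric("tokens", 5)       # safe: no meter yet, so this is a no-op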

View file

@ -1,17 +0,0 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from typing import Any
from .config import SampleConfig
async def get_adapter_impl(config: SampleConfig, _deps) -> Any:
from .sample import SampleTelemetryImpl
impl = SampleTelemetryImpl(config)
await impl.initialize()
return impl

View file

@ -1,12 +0,0 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from pydantic import BaseModel
class SampleConfig(BaseModel):
host: str = "localhost"
port: int = 9999

View file

@ -1,17 +0,0 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from llama_stack.apis.telemetry import Telemetry
from .config import SampleConfig
class SampleTelemetryImpl(Telemetry):
def __init__(self, config: SampleConfig):
self.config = config
async def initialize(self):
pass

View file

@ -4,13 +4,16 @@
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from .code_interpreter import CodeInterpreterToolRuntimeImpl
from typing import Any, Dict
from .config import CodeInterpreterToolConfig
__all__ = ["CodeInterpreterToolConfig", "CodeInterpreterToolRuntimeImpl"]
async def get_provider_impl(config: CodeInterpreterToolConfig, _deps):
async def get_provider_impl(config: CodeInterpreterToolConfig, _deps: Dict[str, Any]):
from .code_interpreter import CodeInterpreterToolRuntimeImpl
impl = CodeInterpreterToolRuntimeImpl(config)
await impl.initialize()
return impl

View file

@ -76,6 +76,7 @@ class CodeExecutionRequest:
only_last_cell_fail: bool = True
seed: int = 0
strip_fpaths_in_stderr: bool = True
use_bwrap: bool = True
class CodeExecutor:
@ -103,8 +104,6 @@ _set_seeds()\
script = "\n\n".join([seeds_prefix] + [CODE_ENV_PREFIX] + scripts)
with tempfile.TemporaryDirectory() as dpath:
bwrap_prefix = "bwrap " + generate_bwrap_command(bind_dirs=[dpath])
cmd = [*bwrap_prefix.split(), sys.executable, "-c", script]
code_fpath = os.path.join(dpath, "code.py")
with open(code_fpath, "w") as f:
f.write(script)
@ -118,6 +117,13 @@ _set_seeds()\
MPLBACKEND="module://matplotlib_custom_backend",
PYTHONPATH=f"{DIRNAME}:{python_path}",
)
if req.use_bwrap:
bwrap_prefix = "bwrap " + generate_bwrap_command(bind_dirs=[dpath])
cmd = [*bwrap_prefix.split(), sys.executable, "-c", script]
else:
cmd = [sys.executable, "-c", script]
stdout, stderr, returncode = do_subprocess(
cmd=cmd,
env=env,
@ -162,7 +168,7 @@ def process_matplotlib_response(response, matplotlib_dump_dir: str):
image_paths = []
for i, img in enumerate(images):
# create new directory for each day to better organize data:
dump_dname = datetime.today().strftime("%Y-%m-%d")
dump_dname = datetime.today().strftime("%Y-%m-%d") # noqa: DTZ002 - we don't care about timezones here since we are displaying the date
dump_dpath = Path(matplotlib_dump_dir, dump_dname)
dump_dpath.mkdir(parents=True, exist_ok=True)
# save image into a file
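The executor now takes the sandboxing decision from the request: the bwrap prefix is only prepended when use_bwrap is set. A hedged sketch of the resulting command construction; the prefix string below is an illustrative stand-in for generate_bwrap_command(bind_dirs=[dpath]).

# Sketch only: conditionally wrap the interpreter invocation in a bwrap sandbox.
import sys

def build_cmd(script: str, use_bwrap: bool) -> list:
    bwrap_prefix = "bwrap --dev-bind / /"          # illustrative; the real prefix is generated per request
    if use_bwrap:
        return [*bwrap_prefix.split(), sys.executable, "-c", script]
    return [sys.executable, "-c", script]

print(build_cmd("print('hi')", use_bwrap=False))   # plain interpreter call, no sandbox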

View file

@ -5,7 +5,9 @@
# the root directory of this source tree.
import asyncio
import logging
import os
import tempfile
from typing import Any, Dict, List, Optional
@ -36,7 +38,7 @@ class CodeInterpreterToolRuntimeImpl(ToolsProtocolPrivate, ToolRuntime):
async def initialize(self):
pass
async def register_tool(self, tool: Tool):
async def register_tool(self, tool: Tool) -> None:
pass
async def unregister_tool(self, tool_id: str) -> None:
@ -61,8 +63,10 @@ class CodeInterpreterToolRuntimeImpl(ToolsProtocolPrivate, ToolRuntime):
async def invoke_tool(self, tool_name: str, kwargs: Dict[str, Any]) -> ToolInvocationResult:
script = kwargs["code"]
req = CodeExecutionRequest(scripts=[script])
res = self.code_executor.execute(req)
# Use environment variable to control bwrap usage
force_disable_bwrap = os.environ.get("DISABLE_CODE_SANDBOX", "").lower() in ("1", "true", "yes")
req = CodeExecutionRequest(scripts=[script], use_bwrap=not force_disable_bwrap)
res = await asyncio.to_thread(self.code_executor.execute, req)
pieces = [res["process_status"]]
for out_type in ["stdout", "stderr"]:
res_out = res[out_type]
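Two things change in invoke_tool: sandboxing can be switched off through the DISABLE_CODE_SANDBOX environment variable, and the blocking executor call is moved off the event loop with asyncio.to_thread. A minimal sketch of that pattern, with a stand-in blocking function in place of the real CodeExecutor:

# Sketch only: env-controlled flag plus offloading a blocking call to a worker thread.
import asyncio
import os
import time

def run_code_blocking(script: str, use_bwrap: bool) -> str:
    time.sleep(0.1)                         # stand-in for the blocking CodeExecutor.execute()
    return f"ran {script!r} (sandboxed={use_bwrap})"

async def invoke(script: str) -> str:
    force_disable_bwrap = os.environ.get("DISABLE_CODE_SANDBOX", "").lower() in ("1", "true", "yes")
    return await asyncio.to_thread(run_code_blocking, script, not force_disable_bwrap)

print(asyncio.run(invoke("print(1)")))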

View file

@ -4,8 +4,12 @@
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from typing import Any, Dict
from pydantic import BaseModel
class CodeInterpreterToolConfig(BaseModel):
pass
@classmethod
def sample_run_config(cls, __distro_dir__: str, **kwargs: Any) -> Dict[str, Any]:
return {}

View file

@ -73,7 +73,10 @@ def show():
image_data.append({"image_base64": image_base64})
buf.close()
req_con, resp_con = _open_connections()
# The _open_connections method is dynamically made available to
# the interpreter by bundling code from "code_env_prefix.py" -- by literally prefixing it -- and
# then "eval"ing it within a sandboxed interpreter.
req_con, resp_con = _open_connections() # noqa: F821
_json_dump = _json.dumps(
{

View file

@ -4,8 +4,12 @@
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from typing import Any, Dict
from pydantic import BaseModel
class RagToolRuntimeConfig(BaseModel):
pass
@classmethod
def sample_run_config(cls, __distro_dir__: str, **kwargs: Any) -> Dict[str, Any]:
return {}

View file

@ -10,6 +10,8 @@ import secrets
import string
from typing import Any, Dict, List, Optional
from pydantic import TypeAdapter
from llama_stack.apis.common.content_types import (
URL,
InterleavedContent,
@ -23,6 +25,7 @@ from llama_stack.apis.tools import (
RAGToolRuntime,
ToolDef,
ToolInvocationResult,
ToolParameter,
ToolRuntime,
)
from llama_stack.apis.vector_io import QueryChunksResponse, VectorIO
@ -120,9 +123,14 @@ class MemoryToolRuntimeImpl(ToolsProtocolPrivate, ToolRuntime, RAGToolRuntime):
# sort by score
chunks, scores = zip(*sorted(zip(chunks, scores, strict=False), key=lambda x: x[1], reverse=True), strict=False)
chunks = chunks[: query_config.max_chunks]
tokens = 0
picked = []
for c in chunks:
picked = [
TextContentItem(
text=f"knowledge_search tool found {len(chunks)} chunks:\nBEGIN of knowledge_search tool results.\n"
)
]
for i, c in enumerate(chunks):
metadata = c.metadata
tokens += metadata["token_count"]
if tokens > query_config.max_tokens_in_context:
@ -132,20 +140,13 @@ class MemoryToolRuntimeImpl(ToolsProtocolPrivate, ToolRuntime, RAGToolRuntime):
break
picked.append(
TextContentItem(
text=f"id:{metadata['document_id']}; content:{c.content}",
text=f"Result {i + 1}:\nDocument_id:{metadata['document_id'][:5]}\nContent: {c.content}\n",
)
)
picked.append(TextContentItem(text="END of knowledge_search tool results.\n"))
return RAGQueryResult(
content=[
TextContentItem(
text="Here are the retrieved documents for relevant context:\n=== START-RETRIEVED-CONTEXT ===\n",
),
*picked,
TextContentItem(
text="\n=== END-RETRIEVED-CONTEXT ===\n",
),
],
content=picked,
metadata={
"document_ids": [c.metadata["document_id"] for c in chunks[: len(picked)]],
},
@ -158,17 +159,40 @@ class MemoryToolRuntimeImpl(ToolsProtocolPrivate, ToolRuntime, RAGToolRuntime):
# by the LLM. The method is only implemented so things like /tools can list without
# encountering fatals.
return [
ToolDef(
name="query_from_memory",
description="Retrieve context from memory",
),
ToolDef(
name="insert_into_memory",
description="Insert documents into memory",
),
ToolDef(
name="knowledge_search",
description="Search for information in a database.",
parameters=[
ToolParameter(
name="query",
description="The query to search for. Can be a natural language sentence or keywords.",
parameter_type="string",
),
],
),
]
async def invoke_tool(self, tool_name: str, kwargs: Dict[str, Any]) -> ToolInvocationResult:
raise RuntimeError(
"This toolgroup should not be called generically but only through specific methods of the RAGToolRuntime protocol"
vector_db_ids = kwargs.get("vector_db_ids", [])
query_config = kwargs.get("query_config")
if query_config:
query_config = TypeAdapter(RAGQueryConfig).validate_python(query_config)
else:
# handle someone passing an empty dict
query_config = RAGQueryConfig()
query = kwargs["query"]
result = await self.query(
content=query,
vector_db_ids=vector_db_ids,
query_config=query_config,
)
return ToolInvocationResult(
content=result.content,
metadata=result.metadata,
)
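Instead of raising unconditionally, invoke_tool now serves knowledge_search directly: it reads vector_db_ids and an optional query_config from the kwargs, validates the config with a pydantic TypeAdapter, and delegates to query(). A hedged sketch of the payload a caller would pass; the identifiers are illustrative.

# Sketch only: illustrative payload for invoke_tool("knowledge_search", ...).
kwargs = {
    "query": "What is the capital of France?",   # required; natural language or keywords
    "vector_db_ids": ["my-vector-db"],           # illustrative vector DB id
    "query_config": {"max_chunks": 5},           # optional; validated via TypeAdapter(RAGQueryConfig)
}
# result = await impl.invoke_tool("knowledge_search", kwargs)
# result.content  -> numbered "Result N:" text items wrapped in the BEGIN/END knowledge_search markers
# result.metadata -> {"document_ids": [...]}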

View file

@ -4,14 +4,14 @@
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from typing import Dict
from typing import Any, Dict
from llama_stack.providers.datatypes import Api, ProviderSpec
from llama_stack.providers.datatypes import Api
from .config import ChromaVectorIOConfig
async def get_provider_impl(config: ChromaVectorIOConfig, deps: Dict[Api, ProviderSpec]):
async def get_provider_impl(config: ChromaVectorIOConfig, deps: Dict[Api, Any]):
from llama_stack.providers.remote.vector_io.chroma.chroma import (
ChromaVectorIOAdapter,
)

View file

@ -13,5 +13,5 @@ class ChromaVectorIOConfig(BaseModel):
db_path: str
@classmethod
def sample_config(cls) -> Dict[str, Any]:
return {"db_path": "{env.CHROMADB_PATH}"}
def sample_run_config(cls, db_path: str = "${env.CHROMADB_PATH}", **kwargs: Any) -> Dict[str, Any]:
return {"db_path": db_path}
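The inline Chroma config drops the old sample_config in favour of the sample_run_config signature used by the other providers, keeping the environment placeholder as the default. A short usage sketch; the concrete path is illustrative and ChromaVectorIOConfig is the class from the hunk above.

# Sketch only: default keeps the env placeholder; callers may pass a concrete path.
ChromaVectorIOConfig.sample_run_config()                          # {"db_path": "${env.CHROMADB_PATH}"}
ChromaVectorIOConfig.sample_run_config(db_path="/tmp/chroma.db")  # {"db_path": "/tmp/chroma.db"}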

View file

@ -4,14 +4,14 @@
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from typing import Dict
from typing import Any, Dict
from llama_stack.providers.datatypes import Api, ProviderSpec
from llama_stack.providers.datatypes import Api
from .config import FaissVectorIOConfig
async def get_provider_impl(config: FaissVectorIOConfig, deps: Dict[Api, ProviderSpec]):
async def get_provider_impl(config: FaissVectorIOConfig, deps: Dict[Api, Any]):
from .faiss import FaissVectorIOAdapter
assert isinstance(config, FaissVectorIOConfig), f"Unexpected config type: {type(config)}"

View file

@ -20,7 +20,7 @@ class FaissVectorIOConfig(BaseModel):
kvstore: KVStoreConfig
@classmethod
def sample_run_config(cls, __distro_dir__: str) -> Dict[str, Any]:
def sample_run_config(cls, __distro_dir__: str, **kwargs: Any) -> Dict[str, Any]:
return {
"kvstore": SqliteKVStoreConfig.sample_run_config(
__distro_dir__=__distro_dir__,

View file

@ -4,6 +4,7 @@
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import asyncio
import base64
import io
import json
@ -99,7 +100,7 @@ class FaissIndex(EmbeddingIndex):
await self._save_index()
async def query(self, embedding: NDArray, k: int, score_threshold: float) -> QueryChunksResponse:
distances, indices = self.index.search(embedding.reshape(1, -1).astype(np.float32), k)
distances, indices = await asyncio.to_thread(self.index.search, embedding.reshape(1, -1).astype(np.float32), k)
chunks = []
scores = []

View file

@ -0,0 +1,19 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from typing import Any, Dict
from llama_stack.providers.datatypes import Api
from .config import MilvusVectorIOConfig
async def get_provider_impl(config: MilvusVectorIOConfig, deps: Dict[Api, Any]):
from llama_stack.providers.remote.vector_io.milvus.milvus import MilvusVectorIOAdapter
impl = MilvusVectorIOAdapter(config, deps[Api.inference])
await impl.initialize()
return impl

View file

@ -0,0 +1,20 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from typing import Any, Dict
from pydantic import BaseModel
from llama_stack.schema_utils import json_schema_type
@json_schema_type
class MilvusVectorIOConfig(BaseModel):
db_path: str
@classmethod
def sample_run_config(cls, __distro_dir__: str, **kwargs: Any) -> Dict[str, Any]:
return {"db_path": "${env.MILVUS_DB_PATH}"}

Some files were not shown because too many files have changed in this diff.