impls -> inline, adapters -> remote (#381)

Ashwin Bharambe 2024-11-06 14:54:05 -08:00 committed by GitHub
parent b10e9f46bb
commit 994732e2e0
169 changed files with 106 additions and 105 deletions


@@ -0,0 +1,5 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.


@@ -0,0 +1,27 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from typing import Dict
from llama_stack.distribution.datatypes import Api, ProviderSpec
from .config import MetaReferenceAgentsImplConfig
async def get_provider_impl(
config: MetaReferenceAgentsImplConfig, deps: Dict[Api, ProviderSpec]
):
from .agents import MetaReferenceAgentsImpl
impl = MetaReferenceAgentsImpl(
config,
deps[Api.inference],
deps[Api.memory],
deps[Api.safety],
deps[Api.memory_banks],
)
await impl.initialize()
return impl


@@ -0,0 +1,844 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import asyncio
import copy
import os
import re
import secrets
import shutil
import string
import tempfile
import uuid
from datetime import datetime
from typing import AsyncGenerator, List, Tuple
from urllib.parse import urlparse
import httpx
from termcolor import cprint
from llama_stack.apis.agents import * # noqa: F403
from llama_stack.apis.inference import * # noqa: F403
from llama_stack.apis.memory import * # noqa: F403
from llama_stack.apis.memory_banks import * # noqa: F403
from llama_stack.apis.safety import * # noqa: F403
from llama_stack.providers.utils.kvstore import KVStore
from llama_stack.providers.utils.telemetry import tracing
from .persistence import AgentPersistence
from .rag.context_retriever import generate_rag_query
from .safety import SafetyException, ShieldRunnerMixin
from .tools.base import BaseTool
from .tools.builtin import (
CodeInterpreterTool,
interpret_content_as_attachment,
PhotogenTool,
SearchTool,
WolframAlphaTool,
)
from .tools.safety import SafeTool
def make_random_string(length: int = 8):
return "".join(
secrets.choice(string.ascii_letters + string.digits) for _ in range(length)
)
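# Orchestrates a single agent instance: runs input/output shields, optional RAG
# context retrieval, inference, and builtin tool execution for each turn.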
class ChatAgent(ShieldRunnerMixin):
def __init__(
self,
agent_id: str,
agent_config: AgentConfig,
inference_api: Inference,
memory_api: Memory,
memory_banks_api: MemoryBanks,
safety_api: Safety,
persistence_store: KVStore,
):
self.agent_id = agent_id
self.agent_config = agent_config
self.inference_api = inference_api
self.memory_api = memory_api
self.memory_banks_api = memory_banks_api
self.safety_api = safety_api
self.storage = AgentPersistence(agent_id, persistence_store)
self.tempdir = tempfile.mkdtemp()
builtin_tools = []
for tool_defn in agent_config.tools:
if isinstance(tool_defn, WolframAlphaToolDefinition):
tool = WolframAlphaTool(tool_defn.api_key)
elif isinstance(tool_defn, SearchToolDefinition):
tool = SearchTool(tool_defn.engine, tool_defn.api_key)
elif isinstance(tool_defn, CodeInterpreterToolDefinition):
tool = CodeInterpreterTool()
elif isinstance(tool_defn, PhotogenToolDefinition):
tool = PhotogenTool(dump_dir=self.tempdir)
else:
continue
builtin_tools.append(
SafeTool(
tool,
safety_api,
tool_defn.input_shields,
tool_defn.output_shields,
)
)
self.tools_dict = {t.get_name(): t for t in builtin_tools}
ShieldRunnerMixin.__init__(
self,
safety_api,
input_shields=agent_config.input_shields,
output_shields=agent_config.output_shields,
)
def __del__(self):
shutil.rmtree(self.tempdir)
def turn_to_messages(self, turn: Turn) -> List[Message]:
messages = []
# We do not want to keep adding RAG context to the input messages
# Maybe this should be a parameter of the agentic instance
# that can define its behavior in a custom way
for m in turn.input_messages:
msg = m.copy()
if isinstance(msg, UserMessage):
msg.context = None
messages.append(msg)
for step in turn.steps:
if step.step_type == StepType.inference.value:
messages.append(step.model_response)
elif step.step_type == StepType.tool_execution.value:
for response in step.tool_responses:
messages.append(
ToolResponseMessage(
call_id=response.call_id,
tool_name=response.tool_name,
content=response.content,
)
)
elif step.step_type == StepType.shield_call.value:
if step.violation:
# CompletionMessage itself in the ShieldResponse
messages.append(
CompletionMessage(
content=step.violation.user_message,
stop_reason=StopReason.end_of_turn,
)
)
# print_dialog(messages)
return messages
async def create_session(self, name: str) -> str:
return await self.storage.create_session(name)
@tracing.span("create_and_execute_turn")
async def create_and_execute_turn(
self, request: AgentTurnCreateRequest
) -> AsyncGenerator:
assert request.stream is True, "Non-streaming not supported"
session_info = await self.storage.get_session_info(request.session_id)
if session_info is None:
raise ValueError(f"Session {request.session_id} not found")
turns = await self.storage.get_session_turns(request.session_id)
messages = []
if len(turns) == 0 and self.agent_config.instructions != "":
messages.append(SystemMessage(content=self.agent_config.instructions))
for i, turn in enumerate(turns):
messages.extend(self.turn_to_messages(turn))
messages.extend(request.messages)
turn_id = str(uuid.uuid4())
start_time = datetime.now()
yield AgentTurnResponseStreamChunk(
event=AgentTurnResponseEvent(
payload=AgentTurnResponseTurnStartPayload(
turn_id=turn_id,
)
)
)
steps = []
output_message = None
async for chunk in self.run(
session_id=request.session_id,
turn_id=turn_id,
input_messages=messages,
attachments=request.attachments or [],
sampling_params=self.agent_config.sampling_params,
stream=request.stream,
):
if isinstance(chunk, CompletionMessage):
cprint(
f"{chunk.role.capitalize()}: {chunk.content}",
"white",
attrs=["bold"],
)
output_message = chunk
continue
assert isinstance(
chunk, AgentTurnResponseStreamChunk
), f"Unexpected type {type(chunk)}"
event = chunk.event
if (
event.payload.event_type
== AgentTurnResponseEventType.step_complete.value
):
steps.append(event.payload.step_details)
yield chunk
assert output_message is not None
turn = Turn(
turn_id=turn_id,
session_id=request.session_id,
input_messages=request.messages,
output_message=output_message,
started_at=start_time,
completed_at=datetime.now(),
steps=steps,
)
await self.storage.add_turn_to_session(request.session_id, turn)
chunk = AgentTurnResponseStreamChunk(
event=AgentTurnResponseEvent(
payload=AgentTurnResponseTurnCompletePayload(
turn=turn,
)
)
)
yield chunk
async def run(
self,
session_id: str,
turn_id: str,
input_messages: List[Message],
attachments: List[Attachment],
sampling_params: SamplingParams,
stream: bool = False,
) -> AsyncGenerator:
# Using async generators makes downstream code much simpler and everything amenable to
# streaming. However, it also complicates things here because async generators cannot
# return a "final value" from a `yield from` statement. We simulate that by yielding a
# final boolean (indicating whether an exception happened) and then explicitly testing for it.
async for res in self.run_multiple_shields_wrapper(
turn_id, input_messages, self.input_shields, "user-input"
):
if isinstance(res, bool):
return
else:
yield res
final_response = None
async for res in self._run(
session_id, turn_id, input_messages, attachments, sampling_params, stream
):
if isinstance(res, bool):
return
elif isinstance(res, CompletionMessage):
final_response = res
break
else:
yield res
assert final_response is not None
# for output shields run on the full input and output combination
messages = input_messages + [final_response]
async for res in self.run_multiple_shields_wrapper(
turn_id, messages, self.output_shields, "assistant-output"
):
if isinstance(res, bool):
return
else:
yield res
yield final_response
@tracing.span("run_shields")
async def run_multiple_shields_wrapper(
self,
turn_id: str,
messages: List[Message],
shields: List[str],
touchpoint: str,
) -> AsyncGenerator:
if len(shields) == 0:
return
step_id = str(uuid.uuid4())
try:
yield AgentTurnResponseStreamChunk(
event=AgentTurnResponseEvent(
payload=AgentTurnResponseStepStartPayload(
step_type=StepType.shield_call.value,
step_id=step_id,
metadata=dict(touchpoint=touchpoint),
)
)
)
await self.run_multiple_shields(messages, shields)
except SafetyException as e:
yield AgentTurnResponseStreamChunk(
event=AgentTurnResponseEvent(
payload=AgentTurnResponseStepCompletePayload(
step_type=StepType.shield_call.value,
step_details=ShieldCallStep(
step_id=step_id,
turn_id=turn_id,
violation=e.violation,
),
)
)
)
yield CompletionMessage(
content=str(e),
stop_reason=StopReason.end_of_turn,
)
yield False
yield AgentTurnResponseStreamChunk(
event=AgentTurnResponseEvent(
payload=AgentTurnResponseStepCompletePayload(
step_type=StepType.shield_call.value,
step_details=ShieldCallStep(
step_id=step_id,
turn_id=turn_id,
violation=None,
),
)
)
)
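# Core turn loop: optionally retrieve RAG context or register attachments, then
# alternate inference and builtin tool execution until the model finishes or
# iteration/token limits are reached.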
async def _run(
self,
session_id: str,
turn_id: str,
input_messages: List[Message],
attachments: List[Attachment],
sampling_params: SamplingParams,
stream: bool = False,
) -> AsyncGenerator:
enabled_tools = set(t.type for t in self.agent_config.tools)
need_rag_context = await self._should_retrieve_context(
input_messages, attachments
)
if need_rag_context:
step_id = str(uuid.uuid4())
yield AgentTurnResponseStreamChunk(
event=AgentTurnResponseEvent(
payload=AgentTurnResponseStepStartPayload(
step_type=StepType.memory_retrieval.value,
step_id=step_id,
)
)
)
# TODO: find older context from the session and either replace it
# or append with a sliding window. this is really a very simplistic implementation
with tracing.span("retrieve_rag_context"):
rag_context, bank_ids = await self._retrieve_context(
session_id, input_messages, attachments
)
step_id = str(uuid.uuid4())
yield AgentTurnResponseStreamChunk(
event=AgentTurnResponseEvent(
payload=AgentTurnResponseStepCompletePayload(
step_type=StepType.memory_retrieval.value,
step_id=step_id,
step_details=MemoryRetrievalStep(
turn_id=turn_id,
step_id=step_id,
memory_bank_ids=bank_ids,
inserted_context=rag_context or "",
),
)
)
)
if rag_context:
last_message = input_messages[-1]
last_message.context = "\n".join(rag_context)
elif attachments and AgentTool.code_interpreter.value in enabled_tools:
urls = [a.content for a in attachments if isinstance(a.content, URL)]
# TODO: we need to migrate URL away from str type
pattern = re.compile("^(https?://|file://|data:)")
urls += [
URL(uri=a.content) for a in attachments if pattern.match(a.content)
]
msg = await attachment_message(self.tempdir, urls)
input_messages.append(msg)
output_attachments = []
n_iter = 0
while True:
msg = input_messages[-1]
if msg.role == Role.user.value:
color = "blue"
elif msg.role == Role.ipython.value:
color = "yellow"
else:
color = None
if len(str(msg)) > 1000:
msg_str = f"{str(msg)[:500]}...<more>...{str(msg)[-500:]}"
else:
msg_str = str(msg)
cprint(f"{msg_str}", color=color)
step_id = str(uuid.uuid4())
yield AgentTurnResponseStreamChunk(
event=AgentTurnResponseEvent(
payload=AgentTurnResponseStepStartPayload(
step_type=StepType.inference.value,
step_id=step_id,
)
)
)
tool_calls = []
content = ""
stop_reason = None
with tracing.span("inference"):
async for chunk in await self.inference_api.chat_completion(
self.agent_config.model,
input_messages,
tools=self._get_tools(),
tool_prompt_format=self.agent_config.tool_prompt_format,
stream=True,
sampling_params=sampling_params,
):
event = chunk.event
if event.event_type == ChatCompletionResponseEventType.start:
continue
elif event.event_type == ChatCompletionResponseEventType.complete:
stop_reason = StopReason.end_of_turn
continue
delta = event.delta
if isinstance(delta, ToolCallDelta):
if delta.parse_status == ToolCallParseStatus.success:
tool_calls.append(delta.content)
if stream:
yield AgentTurnResponseStreamChunk(
event=AgentTurnResponseEvent(
payload=AgentTurnResponseStepProgressPayload(
step_type=StepType.inference.value,
step_id=step_id,
model_response_text_delta="",
tool_call_delta=delta,
)
)
)
elif isinstance(delta, str):
content += delta
if stream and event.stop_reason is None:
yield AgentTurnResponseStreamChunk(
event=AgentTurnResponseEvent(
payload=AgentTurnResponseStepProgressPayload(
step_type=StepType.inference.value,
step_id=step_id,
model_response_text_delta=event.delta,
)
)
)
else:
raise ValueError(f"Unexpected delta type {type(delta)}")
if event.stop_reason is not None:
stop_reason = event.stop_reason
stop_reason = stop_reason or StopReason.out_of_tokens
# If tool calls were parsed successfully, clear the content;
# otherwise the tool-call string would also remain in the content
# and the tool-call syntax would be included twice in the tokens
if tool_calls:
content = ""
message = CompletionMessage(
content=content,
stop_reason=stop_reason,
tool_calls=tool_calls,
)
yield AgentTurnResponseStreamChunk(
event=AgentTurnResponseEvent(
payload=AgentTurnResponseStepCompletePayload(
step_type=StepType.inference.value,
step_id=step_id,
step_details=InferenceStep(
# somewhere deep, we are re-assigning message or closing over some
# variable which causes message to mutate later on. fix with a
# `deepcopy` for now, but this is symptomatic of a deeper issue.
step_id=step_id,
turn_id=turn_id,
model_response=copy.deepcopy(message),
),
)
)
)
if n_iter >= self.agent_config.max_infer_iters:
cprint("Done with MAX iterations, exiting.")
yield message
break
if stop_reason == StopReason.out_of_tokens:
cprint("Out of token budget, exiting.")
yield message
break
if len(message.tool_calls) == 0:
if stop_reason == StopReason.end_of_turn:
# TODO: UPDATE RETURN TYPE TO SEND A TUPLE OF (MESSAGE, ATTACHMENTS)
if len(output_attachments) > 0:
if isinstance(message.content, list):
message.content += output_attachments
else:
message.content = [message.content] + output_attachments
yield message
else:
cprint(f"Partial message: {str(message)}", color="green")
input_messages = input_messages + [message]
else:
cprint(f"{str(message)}", color="green")
try:
tool_call = message.tool_calls[0]
name = tool_call.tool_name
if not isinstance(name, BuiltinTool):
yield message
return
step_id = str(uuid.uuid4())
yield AgentTurnResponseStreamChunk(
event=AgentTurnResponseEvent(
payload=AgentTurnResponseStepStartPayload(
step_type=StepType.tool_execution.value,
step_id=step_id,
)
)
)
yield AgentTurnResponseStreamChunk(
event=AgentTurnResponseEvent(
payload=AgentTurnResponseStepProgressPayload(
step_type=StepType.tool_execution.value,
step_id=step_id,
tool_call=tool_call,
)
)
)
with tracing.span("tool_execution"):
result_messages = await execute_tool_call_maybe(
self.tools_dict,
[message],
)
assert (
len(result_messages) == 1
), "Currently not supporting multiple messages"
result_message = result_messages[0]
yield AgentTurnResponseStreamChunk(
event=AgentTurnResponseEvent(
payload=AgentTurnResponseStepCompletePayload(
step_type=StepType.tool_execution.value,
step_details=ToolExecutionStep(
step_id=step_id,
turn_id=turn_id,
tool_calls=[tool_call],
tool_responses=[
ToolResponse(
call_id=result_message.call_id,
tool_name=result_message.tool_name,
content=result_message.content,
)
],
),
)
)
)
# TODO: add tool-input touchpoint and a "start" event for this step also
# but that needs a lot more refactoring of Tool code potentially
yield AgentTurnResponseStreamChunk(
event=AgentTurnResponseEvent(
payload=AgentTurnResponseStepCompletePayload(
step_type=StepType.shield_call.value,
step_details=ShieldCallStep(
step_id=str(uuid.uuid4()),
turn_id=turn_id,
violation=None,
),
)
)
)
except SafetyException as e:
yield AgentTurnResponseStreamChunk(
event=AgentTurnResponseEvent(
payload=AgentTurnResponseStepCompletePayload(
step_type=StepType.shield_call.value,
step_details=ShieldCallStep(
step_id=str(uuid.uuid4()),
turn_id=turn_id,
violation=e.violation,
),
)
)
)
yield CompletionMessage(
content=str(e),
stop_reason=StopReason.end_of_turn,
)
yield False
return
if out_attachment := interpret_content_as_attachment(
result_message.content
):
# NOTE: when we push this message back to the model, the model may ignore the
# attached file path etc., since the model is trained to only provide a user message
# with the summary. We keep all generated attachments and then attach them to the final message.
output_attachments.append(out_attachment)
input_messages = input_messages + [message, result_message]
n_iter += 1
async def _ensure_memory_bank(self, session_id: str) -> str:
session_info = await self.storage.get_session_info(session_id)
if session_info is None:
raise ValueError(f"Session {session_id} not found")
if session_info.memory_bank_id is None:
bank_id = f"memory_bank_{session_id}"
memory_bank = VectorMemoryBankDef(
identifier=bank_id,
embedding_model="all-MiniLM-L6-v2",
chunk_size_in_tokens=512,
)
await self.memory_banks_api.register_memory_bank(memory_bank)
await self.storage.add_memory_bank_to_session(session_id, bank_id)
else:
bank_id = session_info.memory_bank_id
return bank_id
async def _should_retrieve_context(
self, messages: List[Message], attachments: List[Attachment]
) -> bool:
enabled_tools = set(t.type for t in self.agent_config.tools)
if attachments:
if (
AgentTool.code_interpreter.value in enabled_tools
and self.agent_config.tool_choice == ToolChoice.required
):
return False
else:
return True
return AgentTool.memory.value in enabled_tools
def _memory_tool_definition(self) -> Optional[MemoryToolDefinition]:
for t in self.agent_config.tools:
if t.type == AgentTool.memory.value:
return t
return None
async def _retrieve_context(
self, session_id: str, messages: List[Message], attachments: List[Attachment]
) -> Tuple[Optional[List[str]], Optional[List[int]]]: # (rag_context, bank_ids)
bank_ids = []
memory = self._memory_tool_definition()
assert memory is not None, "Memory tool not configured"
bank_ids.extend(c.bank_id for c in memory.memory_bank_configs)
if attachments:
bank_id = await self._ensure_memory_bank(session_id)
bank_ids.append(bank_id)
documents = [
MemoryBankDocument(
document_id=str(uuid.uuid4()),
content=a.content,
mime_type=a.mime_type,
metadata={},
)
for a in attachments
]
with tracing.span("insert_documents"):
await self.memory_api.insert_documents(bank_id, documents)
else:
session_info = await self.storage.get_session_info(session_id)
if session_info.memory_bank_id:
bank_ids.append(session_info.memory_bank_id)
if not bank_ids:
# this can happen if the per-session memory bank is not yet populated
# (i.e., no prior turns uploaded an Attachment)
return None, []
query = await generate_rag_query(
memory.query_generator_config, messages, inference_api=self.inference_api
)
tasks = [
self.memory_api.query_documents(
bank_id=bank_id,
query=query,
params={
"max_chunks": 5,
},
)
for bank_id in bank_ids
]
results: List[QueryDocumentsResponse] = await asyncio.gather(*tasks)
chunks = [c for r in results for c in r.chunks]
scores = [s for r in results for s in r.scores]
if not chunks:
return None, bank_ids
# sort by score
chunks, scores = zip(
*sorted(zip(chunks, scores), key=lambda x: x[1], reverse=True)
)
tokens = 0
picked = []
for c in chunks[: memory.max_chunks]:
tokens += c.token_count
if tokens > memory.max_tokens_in_context:
cprint(
f"Using {len(picked)} chunks; reached max tokens in context: {tokens}",
"red",
)
break
picked.append(f"id:{c.document_id}; content:{c.content}")
return [
"Here are the retrieved documents for relevant context:\n=== START-RETRIEVED-CONTEXT ===\n",
*picked,
"\n=== END-RETRIEVED-CONTEXT ===\n",
], bank_ids
def _get_tools(self) -> List[ToolDefinition]:
ret = []
for t in self.agent_config.tools:
if isinstance(t, SearchToolDefinition):
ret.append(ToolDefinition(tool_name=BuiltinTool.brave_search))
elif isinstance(t, WolframAlphaToolDefinition):
ret.append(ToolDefinition(tool_name=BuiltinTool.wolfram_alpha))
elif isinstance(t, PhotogenToolDefinition):
ret.append(ToolDefinition(tool_name=BuiltinTool.photogen))
elif isinstance(t, CodeInterpreterToolDefinition):
ret.append(ToolDefinition(tool_name=BuiltinTool.code_interpreter))
elif isinstance(t, FunctionCallToolDefinition):
ret.append(
ToolDefinition(
tool_name=t.function_name,
description=t.description,
parameters=t.parameters,
)
)
return ret
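# Resolve attachment URLs to local files (downloading http(s) content into the tempdir)
# and describe their paths to the model as a code-interpreter tool response.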
async def attachment_message(tempdir: str, urls: List[URL]) -> ToolResponseMessage:
content = []
for url in urls:
uri = url.uri
if uri.startswith("file://"):
filepath = uri[len("file://") :]
elif uri.startswith("http"):
path = urlparse(uri).path
basename = os.path.basename(path)
filepath = f"{tempdir}/{make_random_string() + basename}"
print(f"Downloading {url} -> {filepath}")
async with httpx.AsyncClient() as client:
r = await client.get(uri)
resp = r.text
with open(filepath, "w") as fp:
fp.write(resp)
else:
raise ValueError(f"Unsupported URL {url}")
content.append(f'# There is a file accessible to you at "{filepath}"\n')
return ToolResponseMessage(
call_id="",
tool_name=BuiltinTool.code_interpreter,
content=content,
)
async def execute_tool_call_maybe(
tools_dict: Dict[str, BaseTool], messages: List[CompletionMessage]
) -> List[ToolResponseMessage]:
# While the Tools.run interface takes a list of messages,
# all tools currently only run on a single message.
# When this changes, we can drop this assert.
# Whether to call tools on each message and aggregate,
# or aggregate and call the tool once, remains to be seen.
assert len(messages) == 1, "Expected single message"
message = messages[0]
tool_call = message.tool_calls[0]
name = tool_call.tool_name
assert isinstance(name, BuiltinTool)
name = name.value
assert name in tools_dict, f"Tool {name} not found"
tool = tools_dict[name]
result_messages = await tool.run(messages)
return result_messages
def print_dialog(messages: List[Message]):
for i, m in enumerate(messages):
if m.role == Role.user.value:
color = "red"
elif m.role == Role.assistant.value:
color = "white"
elif m.role == Role.ipython.value:
color = "yellow"
elif m.role == Role.system.value:
color = "green"
else:
color = "white"
s = str(m)
cprint(f"{i} ::: {s[:100]}...", color=color)


@@ -0,0 +1,193 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import json
import logging
import uuid
from typing import AsyncGenerator
from llama_stack.apis.inference import Inference
from llama_stack.apis.memory import Memory
from llama_stack.apis.memory_banks import MemoryBanks
from llama_stack.apis.safety import Safety
from llama_stack.apis.agents import * # noqa: F403
from llama_stack.providers.utils.kvstore import InmemoryKVStoreImpl, kvstore_impl
from .agent_instance import ChatAgent
from .config import MetaReferenceAgentsImplConfig
logger = logging.getLogger()
logger.setLevel(logging.INFO)
class MetaReferenceAgentsImpl(Agents):
def __init__(
self,
config: MetaReferenceAgentsImplConfig,
inference_api: Inference,
memory_api: Memory,
safety_api: Safety,
memory_banks_api: MemoryBanks,
):
self.config = config
self.inference_api = inference_api
self.memory_api = memory_api
self.safety_api = safety_api
self.memory_banks_api = memory_banks_api
self.in_memory_store = InmemoryKVStoreImpl()
async def initialize(self) -> None:
self.persistence_store = await kvstore_impl(self.config.persistence_store)
async def create_agent(
self,
agent_config: AgentConfig,
) -> AgentCreateResponse:
agent_id = str(uuid.uuid4())
await self.persistence_store.set(
key=f"agent:{agent_id}",
value=agent_config.json(),
)
return AgentCreateResponse(
agent_id=agent_id,
)
async def get_agent(self, agent_id: str) -> ChatAgent:
agent_config = await self.persistence_store.get(
key=f"agent:{agent_id}",
)
if not agent_config:
raise ValueError(f"Could not find agent config for {agent_id}")
try:
agent_config = json.loads(agent_config)
except json.JSONDecodeError as e:
raise ValueError(
f"Could not JSON decode agent config for {agent_id}"
) from e
try:
agent_config = AgentConfig(**agent_config)
except Exception as e:
raise ValueError(
f"Could not validate(?) agent config for {agent_id}"
) from e
return ChatAgent(
agent_id=agent_id,
agent_config=agent_config,
inference_api=self.inference_api,
safety_api=self.safety_api,
memory_api=self.memory_api,
memory_banks_api=self.memory_banks_api,
persistence_store=(
self.persistence_store
if agent_config.enable_session_persistence
else self.in_memory_store
),
)
async def create_agent_session(
self,
agent_id: str,
session_name: str,
) -> AgentSessionCreateResponse:
agent = await self.get_agent(agent_id)
session_id = await agent.create_session(session_name)
return AgentSessionCreateResponse(
session_id=session_id,
)
async def create_agent_turn(
self,
agent_id: str,
session_id: str,
messages: List[
Union[
UserMessage,
ToolResponseMessage,
]
],
attachments: Optional[List[Attachment]] = None,
stream: Optional[bool] = False,
) -> AsyncGenerator:
request = AgentTurnCreateRequest(
agent_id=agent_id,
session_id=session_id,
messages=messages,
attachments=attachments,
stream=True,
)
if stream:
return self._create_agent_turn_streaming(request)
else:
raise NotImplementedError("Non-streaming agent turns not yet implemented")
async def _create_agent_turn_streaming(
self,
request: AgentTurnCreateRequest,
) -> AsyncGenerator:
agent = await self.get_agent(request.agent_id)
async for event in agent.create_and_execute_turn(request):
yield event
async def get_agents_turn(
self, agent_id: str, session_id: str, turn_id: str
) -> Turn:
turn = await self.persistence_store.get(
f"session:{agent_id}:{session_id}:{turn_id}"
)
turn = json.loads(turn)
turn = Turn(**turn)
return turn
async def get_agents_step(
self, agent_id: str, session_id: str, turn_id: str, step_id: str
) -> AgentStepResponse:
turn = await self.persistence_store.get(
f"session:{agent_id}:{session_id}:{turn_id}"
)
turn = json.loads(turn)
turn = Turn(**turn)
steps = turn.steps
for step in steps:
if step.step_id == step_id:
return AgentStepResponse(step=step)
raise ValueError(f"Provided step_id {step_id} could not be found")
async def get_agents_session(
self,
agent_id: str,
session_id: str,
turn_ids: Optional[List[str]] = None,
) -> Session:
session = await self.persistence_store.get(f"session:{agent_id}:{session_id}")
session = Session(**json.loads(session), turns=[])
turns = []
if turn_ids:
for turn_id in turn_ids:
turn = await self.persistence_store.get(
f"session:{agent_id}:{session_id}:{turn_id}"
)
turn = json.loads(turn)
turn = Turn(**turn)
turns.append(turn)
return Session(
session_name=session.session_name,
session_id=session_id,
turns=turns if turns else [],
started_at=session.started_at,
)
async def delete_agents_session(self, agent_id: str, session_id: str) -> None:
await self.persistence_store.delete(f"session:{agent_id}:{session_id}")
async def delete_agents(self, agent_id: str) -> None:
await self.persistence_store.delete(f"agent:{agent_id}")


@@ -0,0 +1,14 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from pydantic import BaseModel, Field
from llama_stack.providers.utils.kvstore import KVStoreConfig
from llama_stack.providers.utils.kvstore.config import SqliteKVStoreConfig
class MetaReferenceAgentsImplConfig(BaseModel):
persistence_store: KVStoreConfig = Field(default=SqliteKVStoreConfig())


@@ -0,0 +1,84 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import json
import uuid
from datetime import datetime
from typing import List, Optional
from llama_stack.apis.agents import * # noqa: F403
from pydantic import BaseModel
from llama_stack.providers.utils.kvstore import KVStore
class AgentSessionInfo(BaseModel):
session_id: str
session_name: str
memory_bank_id: Optional[str] = None
started_at: datetime
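# Thin persistence layer that stores session metadata and turns in the configured
# KVStore, keyed by agent and session id.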
class AgentPersistence:
def __init__(self, agent_id: str, kvstore: KVStore):
self.agent_id = agent_id
self.kvstore = kvstore
async def create_session(self, name: str) -> str:
session_id = str(uuid.uuid4())
session_info = AgentSessionInfo(
session_id=session_id,
session_name=name,
started_at=datetime.now(),
)
await self.kvstore.set(
key=f"session:{self.agent_id}:{session_id}",
value=session_info.json(),
)
return session_id
async def get_session_info(self, session_id: str) -> Optional[AgentSessionInfo]:
value = await self.kvstore.get(
key=f"session:{self.agent_id}:{session_id}",
)
if not value:
return None
return AgentSessionInfo(**json.loads(value))
async def add_memory_bank_to_session(self, session_id: str, bank_id: str):
session_info = await self.get_session_info(session_id)
if session_info is None:
raise ValueError(f"Session {session_id} not found")
session_info.memory_bank_id = bank_id
await self.kvstore.set(
key=f"session:{self.agent_id}:{session_id}",
value=session_info.json(),
)
async def add_turn_to_session(self, session_id: str, turn: Turn):
await self.kvstore.set(
key=f"session:{self.agent_id}:{session_id}:{turn.turn_id}",
value=turn.json(),
)
async def get_session_turns(self, session_id: str) -> List[Turn]:
values = await self.kvstore.range(
start_key=f"session:{self.agent_id}:{session_id}:",
end_key=f"session:{self.agent_id}:{session_id}:\xff\xff\xff\xff",
)
turns = []
for value in values:
try:
turn = Turn(**json.loads(value))
turns.append(turn)
except Exception as e:
print(f"Error parsing turn: {e}")
continue
return turns


@@ -0,0 +1,5 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.


@@ -0,0 +1,74 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from typing import List
from jinja2 import Template
from llama_models.llama3.api import * # noqa: F403
from termcolor import cprint # noqa: F401
from llama_stack.apis.agents import (
DefaultMemoryQueryGeneratorConfig,
LLMMemoryQueryGeneratorConfig,
MemoryQueryGenerator,
MemoryQueryGeneratorConfig,
)
from llama_stack.apis.inference import * # noqa: F403
async def generate_rag_query(
config: MemoryQueryGeneratorConfig,
messages: List[Message],
**kwargs,
):
"""
Generates a query that will be used for
retrieving relevant information from the memory bank.
"""
if config.type == MemoryQueryGenerator.default.value:
query = await default_rag_query_generator(config, messages, **kwargs)
elif config.type == MemoryQueryGenerator.llm.value:
query = await llm_rag_query_generator(config, messages, **kwargs)
else:
raise NotImplementedError(f"Unsupported memory query generator {config.type}")
# cprint(f"Generated query >>>: {query}", color="green")
return query
async def default_rag_query_generator(
config: DefaultMemoryQueryGeneratorConfig,
messages: List[Message],
**kwargs,
):
return config.sep.join(interleaved_text_media_as_str(m.content) for m in messages)
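# Render the configured prompt template over the conversation and ask the model
# to produce the retrieval query.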
async def llm_rag_query_generator(
config: LLMMemoryQueryGeneratorConfig,
messages: List[Message],
**kwargs,
):
assert "inference_api" in kwargs, "LLMRAGQueryGenerator needs inference_api"
inference_api = kwargs["inference_api"]
m_dict = {"messages": [m.model_dump() for m in messages]}
template = Template(config.template)
content = template.render(m_dict)
model = config.model
message = UserMessage(content=content)
response = await inference_api.chat_completion(
model=model,
messages=[message],
stream=False,
)
query = response.completion_message.content
return query


@@ -0,0 +1,57 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import asyncio
from typing import List
from llama_models.llama3.api.datatypes import Message
from termcolor import cprint
from llama_stack.apis.safety import * # noqa: F403
class SafetyException(Exception): # noqa: N818
def __init__(self, violation: SafetyViolation):
self.violation = violation
super().__init__(violation.user_message)
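# Runs the configured shields over a set of messages; ERROR-level violations raise
# SafetyException, while WARN-level violations only log a warning.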
class ShieldRunnerMixin:
def __init__(
self,
safety_api: Safety,
input_shields: List[str] = None,
output_shields: List[str] = None,
):
self.safety_api = safety_api
self.input_shields = input_shields
self.output_shields = output_shields
async def run_multiple_shields(
self, messages: List[Message], identifiers: List[str]
) -> None:
responses = await asyncio.gather(
*[
self.safety_api.run_shield(
identifier=identifier,
messages=messages,
)
for identifier in identifiers
]
)
for identifier, response in zip(identifiers, responses):
if not response.violation:
continue
violation = response.violation
if violation.violation_level == ViolationLevel.ERROR:
raise SafetyException(violation)
elif violation.violation_level == ViolationLevel.WARN:
cprint(
f"[Warn]{identifier} raised a warning",
color="red",
)


@@ -0,0 +1,5 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.


@@ -0,0 +1,93 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import unittest
from llama_models.llama3.api.datatypes import (
Attachment,
BuiltinTool,
CompletionMessage,
StopReason,
ToolCall,
)
from ..tools.builtin import CodeInterpreterTool
class TestCodeInterpreter(unittest.IsolatedAsyncioTestCase):
async def test_matplotlib(self):
tool = CodeInterpreterTool()
code = """
import matplotlib.pyplot as plt
import numpy as np
x = np.array([1, 1])
y = np.array([0, 10])
plt.plot(x, y)
plt.title('x = 1')
plt.xlabel('x')
plt.ylabel('y')
plt.grid(True)
plt.axvline(x=1, color='r')
plt.show()
"""
message = CompletionMessage(
role="assistant",
content="",
tool_calls=[
ToolCall(
call_id="call_id",
tool_name=BuiltinTool.code_interpreter,
arguments={"code": code},
)
],
stop_reason=StopReason.end_of_message,
)
ret = await tool.run([message])
self.assertEqual(len(ret), 1)
output = ret[0].content
self.assertIsInstance(output, Attachment)
self.assertEqual(output.mime_type, "image/png")
async def test_path_unlink(self):
tool = CodeInterpreterTool()
code = """
import os
from pathlib import Path
import tempfile
dpath = Path(os.environ["MPLCONFIGDIR"])
with open(dpath / "test", "w") as f:
f.write("hello")
Path(dpath / "test").unlink()
print("_OK_")
"""
message = CompletionMessage(
role="assistant",
content="",
tool_calls=[
ToolCall(
call_id="call_id",
tool_name=BuiltinTool.code_interpreter,
arguments={"code": code},
)
],
stop_reason=StopReason.end_of_message,
)
ret = await tool.run([message])
self.assertEqual(len(ret), 1)
output = ret[0].content
self.assertTrue("_OK_" in output)
if __name__ == "__main__":
unittest.main()


@@ -0,0 +1,306 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from typing import AsyncIterator, List, Optional, Union
import pytest
from llama_stack.apis.inference import * # noqa: F403
from llama_stack.apis.memory import * # noqa: F403
from llama_stack.apis.safety import * # noqa: F403
from llama_stack.apis.agents import * # noqa: F403
from ..agents import (
AGENT_INSTANCES_BY_ID,
MetaReferenceAgentsImpl,
MetaReferenceInferenceConfig,
)
class MockInferenceAPI:
async def chat_completion(
self,
model: str,
messages: List[Message],
sampling_params: Optional[SamplingParams] = SamplingParams(),
response_format: Optional[ResponseFormat] = None,
tools: Optional[List[ToolDefinition]] = None,
tool_choice: Optional[ToolChoice] = None,
tool_prompt_format: Optional[ToolPromptFormat] = None,
stream: Optional[bool] = False,
logprobs: Optional[LogProbConfig] = None,
) -> AsyncIterator[
Union[ChatCompletionResponseStreamChunk, ChatCompletionResponse]
]:
if stream:
yield ChatCompletionResponseStreamChunk(
event=ChatCompletionResponseEvent(
event_type="start",
delta="",
)
)
yield ChatCompletionResponseStreamChunk(
event=ChatCompletionResponseEvent(
event_type="progress",
delta="AI is a fascinating field...",
)
)
# yield ChatCompletionResponseStreamChunk(
# event=ChatCompletionResponseEvent(
# event_type="progress",
# delta=ToolCallDelta(
# content=ToolCall(
# call_id="123",
# tool_name=BuiltinTool.brave_search.value,
# arguments={"query": "AI history"},
# ),
# parse_status="success",
# ),
# )
# )
yield ChatCompletionResponseStreamChunk(
event=ChatCompletionResponseEvent(
event_type="complete",
delta="",
stop_reason="end_of_turn",
)
)
else:
yield ChatCompletionResponse(
completion_message=CompletionMessage(
role="assistant", content="Mock response", stop_reason="end_of_turn"
),
logprobs=[0.1, 0.2, 0.3] if logprobs else None,
)
class MockSafetyAPI:
async def run_shield(
self, shield_type: str, messages: List[Message]
) -> RunShieldResponse:
return RunShieldResponse(violation=None)
class MockMemoryAPI:
def __init__(self):
self.memory_banks = {}
self.documents = {}
async def create_memory_bank(self, name, config, url=None):
bank_id = f"bank_{len(self.memory_banks)}"
bank = MemoryBank(bank_id, name, config, url)
self.memory_banks[bank_id] = bank
self.documents[bank_id] = {}
return bank
async def list_memory_banks(self):
return list(self.memory_banks.values())
async def get_memory_bank(self, bank_id):
return self.memory_banks.get(bank_id)
async def drop_memory_bank(self, bank_id):
if bank_id in self.memory_banks:
del self.memory_banks[bank_id]
del self.documents[bank_id]
return bank_id
async def insert_documents(self, bank_id, documents, ttl_seconds=None):
if bank_id not in self.documents:
raise ValueError(f"Bank {bank_id} not found")
for doc in documents:
self.documents[bank_id][doc.document_id] = doc
async def update_documents(self, bank_id, documents):
if bank_id not in self.documents:
raise ValueError(f"Bank {bank_id} not found")
for doc in documents:
if doc.document_id in self.documents[bank_id]:
self.documents[bank_id][doc.document_id] = doc
async def query_documents(self, bank_id, query, params=None):
if bank_id not in self.documents:
raise ValueError(f"Bank {bank_id} not found")
# Simple mock implementation: return all documents
chunks = [
{"content": doc.content, "token_count": 10, "document_id": doc.document_id}
for doc in self.documents[bank_id].values()
]
scores = [1.0] * len(chunks)
return {"chunks": chunks, "scores": scores}
async def get_documents(self, bank_id, document_ids):
if bank_id not in self.documents:
raise ValueError(f"Bank {bank_id} not found")
return [
self.documents[bank_id][doc_id]
for doc_id in document_ids
if doc_id in self.documents[bank_id]
]
async def delete_documents(self, bank_id, document_ids):
if bank_id not in self.documents:
raise ValueError(f"Bank {bank_id} not found")
for doc_id in document_ids:
self.documents[bank_id].pop(doc_id, None)
@pytest.fixture
def mock_inference_api():
return MockInferenceAPI()
@pytest.fixture
def mock_safety_api():
return MockSafetyAPI()
@pytest.fixture
def mock_memory_api():
return MockMemoryAPI()
@pytest.fixture
async def chat_agent(mock_inference_api, mock_safety_api, mock_memory_api):
impl = MetaReferenceAgentsImpl(
config=MetaReferenceInferenceConfig(),
inference_api=mock_inference_api,
safety_api=mock_safety_api,
memory_api=mock_memory_api,
)
await impl.initialize()
agent_config = AgentConfig(
model="test_model",
instructions="You are a helpful assistant.",
sampling_params=SamplingParams(),
tools=[
# SearchToolDefinition(
# name="brave_search",
# api_key="test_key",
# ),
],
tool_choice=ToolChoice.auto,
enable_session_persistence=False,
input_shields=[],
output_shields=[],
)
response = await impl.create_agent(agent_config)
agent = AGENT_INSTANCES_BY_ID[response.agent_id]
return agent
@pytest.mark.asyncio
async def test_chat_agent_create_session(chat_agent):
session = chat_agent.create_session("Test Session")
assert session.session_name == "Test Session"
assert session.turns == []
assert session.session_id in chat_agent.sessions
@pytest.mark.asyncio
async def test_chat_agent_create_and_execute_turn(chat_agent):
session = chat_agent.create_session("Test Session")
request = AgentTurnCreateRequest(
agent_id="random",
session_id=session.session_id,
messages=[UserMessage(content="Hello")],
)
responses = []
async for response in chat_agent.create_and_execute_turn(request):
responses.append(response)
print(responses)
assert len(responses) > 0
assert len(responses) == 4 # TurnStart, StepStart, StepComplete, TurnComplete
assert responses[0].event.payload.turn_id is not None
@pytest.mark.asyncio
async def test_run_multiple_shields_wrapper(chat_agent):
messages = [UserMessage(content="Test message")]
shields = ["test_shield"]
responses = [
chunk
async for chunk in chat_agent.run_multiple_shields_wrapper(
turn_id="test_turn_id",
messages=messages,
shields=shields,
touchpoint="user-input",
)
]
assert len(responses) == 2 # StepStart, StepComplete
assert responses[0].event.payload.step_type.value == "shield_call"
assert not responses[1].event.payload.step_details.response.is_violation
@pytest.mark.asyncio
@pytest.mark.skip(reason="Not yet implemented; need to mock out tool execution easily")
async def test_chat_agent_complex_turn(chat_agent):
# Setup
session = chat_agent.create_session("Test Session")
request = AgentTurnCreateRequest(
agent_id="random",
session_id=session.session_id,
messages=[UserMessage(content="Tell me about AI and then use a tool.")],
stream=True,
)
# Execute the turn
responses = []
async for response in chat_agent.create_and_execute_turn(request):
responses.append(response)
# Assertions
assert len(responses) > 0
# Check for the presence of different step types
step_types = [
response.event.payload.step_type
for response in responses
if hasattr(response.event.payload, "step_type")
]
assert "shield_call" in step_types, "Shield call step is missing"
assert "inference" in step_types, "Inference step is missing"
assert "tool_execution" in step_types, "Tool execution step is missing"
# Check for the presence of start and complete events
event_types = [
response.event.payload.event_type
for response in responses
if hasattr(response.event.payload, "event_type")
]
assert "start" in event_types, "Start event is missing"
assert "complete" in event_types, "Complete event is missing"
# Check for the presence of tool call
tool_calls = [
response.event.payload.tool_call
for response in responses
if hasattr(response.event.payload, "tool_call")
]
assert any(
tool_call
for tool_call in tool_calls
if tool_call and tool_call.content.get("name") == "memory"
), "Memory tool call is missing"
# Check for the final turn complete event
assert any(
isinstance(response.event.payload, AgentTurnResponseTurnCompletePayload)
for response in responses
), "Turn complete event is missing"
# Verify the turn was added to the session
assert len(session.turns) == 1, "Turn was not added to the session"
assert (
session.turns[0].input_messages == request.messages
), "Input messages do not match"


@@ -0,0 +1,5 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.


@@ -0,0 +1,20 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from abc import ABC, abstractmethod
from typing import List
from llama_stack.apis.inference import Message
class BaseTool(ABC):
@abstractmethod
def get_name(self) -> str:
raise NotImplementedError
@abstractmethod
async def run(self, messages: List[Message]) -> List[Message]:
raise NotImplementedError


@@ -0,0 +1,375 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import json
import re
import tempfile
from abc import abstractmethod
from typing import List, Optional
import requests
from termcolor import cprint
from .ipython_tool.code_execution import (
CodeExecutionContext,
CodeExecutionRequest,
CodeExecutor,
TOOLS_ATTACHMENT_KEY_REGEX,
)
from llama_stack.apis.inference import * # noqa: F403
from llama_stack.apis.agents import * # noqa: F403
from .base import BaseTool
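# Look for the __tools_attachment__={...} marker in tool output (e.g. from the code
# interpreter) and convert it into an Attachment pointing at the generated file.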
def interpret_content_as_attachment(content: str) -> Optional[Attachment]:
match = re.search(TOOLS_ATTACHMENT_KEY_REGEX, content)
if match:
snippet = match.group(1)
data = json.loads(snippet)
return Attachment(
content=URL(uri="file://" + data["filepath"]), mime_type=data["mimetype"]
)
return None
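# Base class for builtin tools that consume exactly one tool call and return a
# single ToolResponseMessage.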
class SingleMessageBuiltinTool(BaseTool):
async def run(self, messages: List[CompletionMessage]) -> List[ToolResponseMessage]:
assert len(messages) == 1, f"Expected single message, got {len(messages)}"
message = messages[0]
assert len(message.tool_calls) == 1, "Expected a single tool call"
tool_call = messages[0].tool_calls[0]
query = tool_call.arguments["query"]
response: str = await self.run_impl(query)
message = ToolResponseMessage(
call_id=tool_call.call_id,
tool_name=tool_call.tool_name,
content=response,
)
return [message]
@abstractmethod
async def run_impl(self, query: str) -> str:
raise NotImplementedError()
class PhotogenTool(SingleMessageBuiltinTool):
def __init__(self, dump_dir: str) -> None:
self.dump_dir = dump_dir
def get_name(self) -> str:
return BuiltinTool.photogen.value
async def run_impl(self, query: str) -> str:
"""
Implement this to give the model an ability to generate images.
Return:
info = {
"filepath": str(image_filepath),
"mimetype": "image/png",
}
"""
raise NotImplementedError()
class SearchTool(SingleMessageBuiltinTool):
def __init__(self, engine: SearchEngineType, api_key: str, **kwargs) -> None:
self.api_key = api_key
if engine == SearchEngineType.bing:
self.engine = BingSearch(api_key, **kwargs)
elif engine == SearchEngineType.brave:
self.engine = BraveSearch(api_key, **kwargs)
else:
raise ValueError(f"Unknown search engine: {engine}")
def get_name(self) -> str:
return BuiltinTool.brave_search.value
async def run_impl(self, query: str) -> str:
return await self.engine.search(query)
class BingSearch:
def __init__(self, api_key: str, top_k: int = 3, **kwargs) -> None:
self.api_key = api_key
self.top_k = top_k
async def search(self, query: str) -> str:
url = "https://api.bing.microsoft.com/v7.0/search"
headers = {
"Ocp-Apim-Subscription-Key": self.api_key,
}
params = {
"count": self.top_k,
"textDecorations": True,
"textFormat": "HTML",
"q": query,
}
response = requests.get(url=url, params=params, headers=headers)
response.raise_for_status()
clean = self._clean_response(response.json())
return json.dumps(clean)
def _clean_response(self, search_response):
clean_response = []
query = search_response["queryContext"]["originalQuery"]
if "webPages" in search_response:
pages = search_response["webPages"]["value"]
for p in pages:
selected_keys = {"name", "url", "snippet"}
clean_response.append(
{k: v for k, v in p.items() if k in selected_keys}
)
if "news" in search_response:
clean_news = []
news = search_response["news"]["value"]
for n in news:
selected_keys = {"name", "url", "description"}
clean_news.append({k: v for k, v in n.items() if k in selected_keys})
clean_response.append(clean_news)
return {"query": query, "top_k": clean_response}
class BraveSearch:
def __init__(self, api_key: str) -> None:
self.api_key = api_key
async def search(self, query: str) -> str:
url = "https://api.search.brave.com/res/v1/web/search"
headers = {
"X-Subscription-Token": self.api_key,
"Accept-Encoding": "gzip",
"Accept": "application/json",
}
payload = {"q": query}
response = requests.get(url=url, params=payload, headers=headers)
return json.dumps(self._clean_brave_response(response.json()))
def _clean_brave_response(self, search_response, top_k=3):
query = None
clean_response = []
if "query" in search_response:
if "original" in search_response["query"]:
query = search_response["query"]["original"]
if "mixed" in search_response:
mixed_results = search_response["mixed"]
for m in mixed_results["main"][:top_k]:
r_type = m["type"]
results = search_response[r_type]["results"]
if r_type == "web":
# For web data - add a single output from the search
idx = m["index"]
selected_keys = [
"type",
"title",
"url",
"description",
"date",
"extra_snippets",
]
cleaned = {
k: v for k, v in results[idx].items() if k in selected_keys
}
elif r_type == "faq":
# For FAQ data - take a list of all the questions & answers
selected_keys = ["type", "question", "answer", "title", "url"]
cleaned = []
for q in results:
cleaned.append(
{k: v for k, v in q.items() if k in selected_keys}
)
elif r_type == "infobox":
idx = m["index"]
selected_keys = [
"type",
"title",
"url",
"description",
"long_desc",
]
cleaned = {
k: v for k, v in results[idx].items() if k in selected_keys
}
elif r_type == "videos":
selected_keys = [
"type",
"url",
"title",
"description",
"date",
]
cleaned = []
for q in results:
cleaned.append(
{k: v for k, v in q.items() if k in selected_keys}
)
elif r_type == "locations":
# For location data - take a list of all the locations
selected_keys = [
"type",
"title",
"url",
"description",
"coordinates",
"postal_address",
"contact",
"rating",
"distance",
"zoom_level",
]
cleaned = []
for q in results:
cleaned.append(
{k: v for k, v in q.items() if k in selected_keys}
)
elif r_type == "news":
# For news data - take a list of all the news results
selected_keys = [
"type",
"title",
"url",
"description",
]
cleaned = []
for q in results:
cleaned.append(
{k: v for k, v in q.items() if k in selected_keys}
)
else:
cleaned = []
clean_response.append(cleaned)
return {"query": query, "top_k": clean_response}
class WolframAlphaTool(SingleMessageBuiltinTool):
def __init__(self, api_key: str) -> None:
self.api_key = api_key
self.url = "https://api.wolframalpha.com/v2/query"
def get_name(self) -> str:
return BuiltinTool.wolfram_alpha.value
async def run_impl(self, query: str) -> str:
params = {
"input": query,
"appid": self.api_key,
"format": "plaintext",
"output": "json",
}
response = requests.get(
self.url,
params=params,
)
return json.dumps(self._clean_wolfram_alpha_response(response.json()))
def _clean_wolfram_alpha_response(self, wa_response):
remove = {
"queryresult": [
"datatypes",
"error",
"timedout",
"timedoutpods",
"numpods",
"timing",
"parsetiming",
"parsetimedout",
"recalculate",
"id",
"host",
"server",
"related",
"version",
{
"pods": [
"scanner",
"id",
"error",
"expressiontypes",
"states",
"infos",
"position",
"numsubpods",
]
},
"assumptions",
],
}
for main_key in remove:
for key_to_remove in remove[main_key]:
try:
if key_to_remove == "assumptions":
if "assumptions" in wa_response[main_key]:
del wa_response[main_key][key_to_remove]
if isinstance(key_to_remove, dict):
for sub_key in key_to_remove:
if sub_key == "pods":
for i in range(len(wa_response[main_key][sub_key])):
if (
wa_response[main_key][sub_key][i]["title"]
== "Result"
):
del wa_response[main_key][sub_key][i + 1 :]
break
sub_items = wa_response[main_key][sub_key]
for i in range(len(sub_items)):
for sub_key_to_remove in key_to_remove[sub_key]:
if sub_key_to_remove in sub_items[i]:
del sub_items[i][sub_key_to_remove]
elif key_to_remove in wa_response[main_key]:
del wa_response[main_key][key_to_remove]
except KeyError:
pass
return wa_response
class CodeInterpreterTool(BaseTool):
def __init__(self) -> None:
ctx = CodeExecutionContext(
matplotlib_dump_dir=tempfile.mkdtemp(),
)
self.code_executor = CodeExecutor(ctx)
def get_name(self) -> str:
return BuiltinTool.code_interpreter.value
async def run(self, messages: List[CompletionMessage]) -> List[ToolResponseMessage]:
message = messages[0]
assert len(message.tool_calls) == 1, "Expected a single tool call"
tool_call = messages[0].tool_calls[0]
script = tool_call.arguments["code"]
req = CodeExecutionRequest(scripts=[script])
res = self.code_executor.execute(req)
pieces = [res["process_status"]]
for out_type in ["stdout", "stderr"]:
res_out = res[out_type]
if res_out != "":
pieces.extend([f"[{out_type}]", res_out, f"[/{out_type}]"])
if out_type == "stderr":
cprint(f"ipython tool error: ↓\n{res_out}", color="red")
message = ToolResponseMessage(
call_id=tool_call.call_id,
tool_name=tool_call.tool_name,
content="\n".join(pieces),
)
return [message]


@@ -0,0 +1,5 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.


@@ -0,0 +1,133 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import errno
# Disabling potentially dangerous functions
import os as _os
from functools import partial
os_funcs_to_disable = [
"kill",
"system",
"putenv",
"remove",
"removedirs",
"rmdir",
"fchdir",
"setuid",
"fork",
"forkpty",
"killpg",
"rename",
"renames",
"truncate",
"replace",
# "unlink", # Commenting as this was blocking matpltlib from rendering plots correctly
"fchmod",
"fchown",
"chmod",
"chown",
"chroot",
"fchdir",
"lchflags",
"lchmod",
"lchown",
"chdir",
]
def call_not_allowed(*args, **kwargs):
raise OSError(errno.EPERM, "Calls are not permitted in this environment")
for func_name in os_funcs_to_disable:
if hasattr(_os, func_name):
setattr(_os, func_name, partial(call_not_allowed, _func_name=f"os.{func_name}"))
import shutil as _shutil
for func_name in ["rmtree", "move", "chown"]:
if hasattr(_shutil, func_name):
setattr(
_shutil,
func_name,
partial(call_not_allowed, _func_name=f"shutil.{func_name}"),
)
import subprocess as _subprocess
def popen_not_allowed(*args, **kwargs):
raise _subprocess.CalledProcessError(
-1,
args[0] if args else "unknown",
stderr="subprocess.Popen is not allowed in this environment",
)
_subprocess.Popen = popen_not_allowed
import atexit as _atexit
import builtins as _builtins
import io as _io
import json as _json
import sys as _sys
# NB! The following "unused" imports are crucial; make sure not to remove
# them with linters - they're used in code_execution.py
from contextlib import ( # noqa
contextmanager as _contextmanager,
redirect_stderr as _redirect_stderr,
redirect_stdout as _redirect_stdout,
)
from multiprocessing.connection import Connection as _Connection
# Mangle imports to avoid polluting model execution namespace.
_IO_SINK = _io.StringIO()
_NETWORK_TIMEOUT = 5
_NETWORK_CONNECTIONS = None
def _open_connections():
global _NETWORK_CONNECTIONS
if _NETWORK_CONNECTIONS is not None:
# Ensure connections only opened once.
return _NETWORK_CONNECTIONS
req_w_fd, resp_r_fd = _sys.argv[1], _sys.argv[2]
req_con = _Connection(int(req_w_fd), readable=False)
resp_con = _Connection(int(resp_r_fd), writable=False)
_NETWORK_CONNECTIONS = (req_con, resp_con)
return _NETWORK_CONNECTIONS
_builtins._open_connections = _open_connections
@_atexit.register
def _close_connections():
global _NETWORK_CONNECTIONS
if _NETWORK_CONNECTIONS is None:
return
for con in _NETWORK_CONNECTIONS:
con.close()
del _NETWORK_CONNECTIONS
def _network_call(request):
# NOTE: We communicate with the parent process in json, encoded
# in raw bytes. We do this because native send/recv methods use
# pickle which involves execution of arbitrary code.
_open_connections()
req_con, resp_con = _NETWORK_CONNECTIONS
req_con.send_bytes(_json.dumps(request).encode("utf-8"))
if resp_con.poll(timeout=_NETWORK_TIMEOUT) is None:
raise Exception(f"Network request timed out: {_json.dumps(request)}")
else:
return _json.loads(resp_con.recv_bytes().decode("utf-8"))


@@ -0,0 +1,256 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import base64
import json
import multiprocessing
import os
import re
import subprocess
import sys
import tempfile
import textwrap
import time
from dataclasses import dataclass
from datetime import datetime
from io import BytesIO
from pathlib import Path
from typing import List
from PIL import Image
from .utils import get_code_env_prefix
TOOLS_ATTACHMENT_KEY = "__tools_attachment__"
TOOLS_ATTACHMENT_KEY_REGEX = re.compile(r"__tools_attachment__=(\{.*?\})")
DIRNAME = Path(__file__).parent
CODE_EXEC_TIMEOUT = 20
CODE_ENV_PREFIX = get_code_env_prefix()
STDOUTERR_SINK_WRAPPER_TEMPLATE = """\
with _redirect_stdout(_IO_SINK), _redirect_stderr(_IO_SINK):
{code}\
"""
TRYEXCEPT_WRAPPER_TEMPLATE = """\
try:
{code}
except:
pass\
"""
def generate_bwrap_command(bind_dirs: List[str]) -> str:
"""
Generate the bwrap command string for binding all
directories in the current directory read-only.
"""
bwrap_args = ""
bwrap_args += "--ro-bind / / "
# Add the --dev flag to mount device files
bwrap_args += "--dev /dev "
for d in bind_dirs:
bwrap_args += f"--bind {d} {d} "
# Add the --unshare-all flag to isolate the sandbox from the rest of the system
bwrap_args += "--unshare-all "
# Add the --die-with-parent flag to ensure the child process dies when bwrap's parent dies
bwrap_args += "--die-with-parent "
return bwrap_args
@dataclass
class CodeExecutionContext:
matplotlib_dump_dir: str
use_proxy: bool = False
@dataclass
class CodeExecutionRequest:
scripts: List[str]
only_last_cell_stdouterr: bool = True
only_last_cell_fail: bool = True
seed: int = 0
strip_fpaths_in_stderr: bool = True
class CodeExecutor:
def __init__(self, context: CodeExecutionContext):
self.context = context
def execute(self, req: CodeExecutionRequest) -> dict:
scripts = req.scripts
for i in range(len(scripts) - 1):
if req.only_last_cell_stdouterr:
scripts[i] = STDOUTERR_SINK_WRAPPER_TEMPLATE.format(
code=textwrap.indent(scripts[i], " " * 4)
)
if req.only_last_cell_fail:
scripts[i] = TRYEXCEPT_WRAPPER_TEMPLATE.format(
code=textwrap.indent(scripts[i], " " * 4)
)
# Seeds prefix:
seed = req.seed
seeds_prefix = f"""\
def _set_seeds():
import random
random.seed({seed})
import numpy as np
np.random.seed({seed})
_set_seeds()\
"""
script = "\n\n".join([seeds_prefix] + [CODE_ENV_PREFIX] + scripts)
with tempfile.TemporaryDirectory() as dpath:
bwrap_prefix = "bwrap " + generate_bwrap_command(bind_dirs=[dpath])
cmd = [*bwrap_prefix.split(), sys.executable, "-c", script]
code_fpath = os.path.join(dpath, "code.py")
with open(code_fpath, "w") as f:
f.write(script)
try:
python_path = os.environ.get("PYTHONPATH", "")
env = dict(
os.environ,
PYTHONHASHSEED=str(seed),
MPLCONFIGDIR=dpath,
MPLBACKEND="module://matplotlib_custom_backend",
PYTHONPATH=f"{DIRNAME}:{python_path}",
)
stdout, stderr, returncode = do_subprocess(
cmd=cmd,
env=env,
ctx=self.context,
)
stderr = stderr.strip()
if req.strip_fpaths_in_stderr:
pattern = r'File "([^"]+)", line (\d+)'
stderr = re.sub(pattern, r"line \2", stderr)
return {
"process_status": "completed",
"returncode": returncode,
"stdout": stdout.strip(),
"stderr": stderr,
}
except subprocess.TimeoutExpired:
return {
"process_status": "timeout",
"stdout": "Timed out",
"stderr": "Timed out",
}
except Exception as e:
return {
"process_status": "error",
"error_type": type(e).__name__,
"stderr": str(e),
"stdout": str(e),
}
def process_matplotlib_response(response, matplotlib_dump_dir: str):
image_data = response["image_data"]
# Convert the base64 string to a bytes object
images = [base64.b64decode(d["image_base64"]) for d in image_data]
# Create a list of PIL images from the bytes objects
images = [Image.open(BytesIO(img)) for img in images]
# Create a list of image paths
image_paths = []
for i, img in enumerate(images):
# create new directory for each day to better organize data:
dump_dname = datetime.today().strftime("%Y-%m-%d")
dump_dpath = Path(matplotlib_dump_dir, dump_dname)
dump_dpath.mkdir(parents=True, exist_ok=True)
# save image into a file
dump_fname = f"matplotlib_{str(time.time()).replace('.', '_')}_{i}.png"
dump_fpath = dump_dpath / dump_fname
img.save(dump_fpath, "PNG")
image_paths.append(str(dump_fpath))
# this is a bit convoluted: we send this response back to the subprocess,
# which prints it out
info = {
"filepath": str(image_paths[-1]),
"mimetype": "image/png",
}
return f"{TOOLS_ATTACHMENT_KEY}={json.dumps(info)}"
def execute_subprocess_request(request, ctx: CodeExecutionContext):
"Route requests from the subprocess (via network Pipes) to the internet/tools."
if request["type"] == "matplotlib":
return process_matplotlib_response(request, ctx.matplotlib_dump_dir)
else:
raise Exception(f'Unrecognised network request type: {request["type"]}')
def do_subprocess(*, cmd: list, env: dict, ctx: CodeExecutionContext):
# Create Pipes to be used for any external tool/network requests.
req_r, req_w = multiprocessing.Pipe(duplex=False)
resp_r, resp_w = multiprocessing.Pipe(duplex=False)
cmd += [str(req_w.fileno()), str(resp_r.fileno())]
proc = subprocess.Popen(
cmd,
pass_fds=(req_w.fileno(), resp_r.fileno()),
text=True,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
close_fds=True,
env=env,
)
# Close unnecessary fds.
req_w.close()
resp_r.close()
pipe_close = False
done_read = False
start = time.monotonic()
while proc.poll() is None and not pipe_close:
if req_r.poll(0.1):
# NB: Python pipe semantics for poll and recv mean that
# poll() returns True if a pipe is closed.
# Cf. this old issue from '09:
# https://bugs.python.org/issue5573
try:
request = json.loads(req_r.recv_bytes().decode("utf-8"))
response = execute_subprocess_request(request, ctx)
resp_w.send_bytes(json.dumps(response).encode("utf-8"))
except EOFError:
# The request pipe is closed - set a marker to exit
# after the next attempt at reading stdout/stderr.
pipe_close = True
try:
# If a lot has been printed, the pipe might be full, but
# proc cannot exit until all of its stdout/stderr has
# been written/read.
stdout, stderr = proc.communicate(timeout=0.3)
done_read = True
except subprocess.TimeoutExpired:
# The program has not terminated. Ignore it, there
# may be more network/tool requests.
continue
if time.monotonic() - start > CODE_EXEC_TIMEOUT:
proc.terminate()
raise subprocess.TimeoutExpired(cmd, CODE_EXEC_TIMEOUT)
if not done_read:
# Solve race condition where process terminates before
# we hit the while loop.
stdout, stderr = proc.communicate(timeout=0.3)
resp_w.close()
req_r.close()
return stdout, stderr, proc.returncode
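A hedged usage sketch of the executor above; the import path is an assumption and `bwrap` must be available on the host:
# Minimal sketch: execute two "cells" where only the last cell's stdout/stderr
# and failures surface, mirroring the wrapping logic above.
from code_execution import CodeExecutionContext, CodeExecutionRequest, CodeExecutor  # assumed path

ctx = CodeExecutionContext(matplotlib_dump_dir="/tmp/mpl_dumps")  # illustrative dir
executor = CodeExecutor(ctx)

req = CodeExecutionRequest(
    scripts=[
        "x = 21 * 2",           # earlier cells: output sunk, exceptions swallowed
        "print('answer:', x)",  # last cell: output returned to the caller
    ]
)
result = executor.execute(req)
print(result["process_status"], result.get("stdout"))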

View file

@ -0,0 +1,87 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
"""
A custom Matplotlib backend that overrides the show method to return image bytes.
"""
import base64
import io
import json as _json
import matplotlib
from matplotlib.backend_bases import FigureManagerBase
# Import necessary components from Matplotlib
from matplotlib.backends.backend_agg import FigureCanvasAgg
class CustomFigureCanvas(FigureCanvasAgg):
def show(self):
# Save the figure to a BytesIO object
buf = io.BytesIO()
self.print_png(buf)
image_bytes = buf.getvalue()
buf.close()
return image_bytes
class CustomFigureManager(FigureManagerBase):
def __init__(self, canvas, num):
super().__init__(canvas, num)
# Mimic module initialization that integrates with the Matplotlib backend system
def _create_figure_manager(num, *args, **kwargs):
"""
Create a custom figure manager instance.
"""
FigureClass = kwargs.pop("FigureClass", None) # noqa: N806
if FigureClass is None:
from matplotlib.figure import Figure
FigureClass = Figure # noqa: N806
fig = FigureClass(*args, **kwargs)
canvas = CustomFigureCanvas(fig)
manager = CustomFigureManager(canvas, num)
return manager
def show():
"""
Handle all figures and potentially return their images as bytes.
This function iterates over all figures registered with the custom backend,
renders them as images in bytes format, and could return a list of bytes objects,
one for each figure, or handle them as needed.
"""
image_data = []
for manager in matplotlib._pylab_helpers.Gcf.get_all_fig_managers():
# Get the figure from the manager
fig = manager.canvas.figure
buf = io.BytesIO() # Create a buffer for the figure
fig.savefig(buf, format="png") # Save the figure to the buffer in PNG format
buf.seek(0) # Go to the beginning of the buffer
image_bytes = buf.getvalue() # Retrieve bytes value
image_base64 = base64.b64encode(image_bytes).decode("utf-8")
image_data.append({"image_base64": image_base64})
buf.close()
req_con, resp_con = _open_connections()
_json_dump = _json.dumps(
{
"type": "matplotlib",
"image_data": image_data,
}
)
req_con.send_bytes(_json_dump.encode("utf-8"))
resp = _json.loads(resp_con.recv_bytes().decode("utf-8"))
print(resp)
FigureCanvas = CustomFigureCanvas
FigureManager = CustomFigureManager
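For context, a hedged sketch of how sandboxed user code reaches this backend: the executor sets `MPLBACKEND=module://matplotlib_custom_backend` in the subprocess environment, so a plain `plt.show()` routes through the `show()` defined here and ships the figure back over the pipe.
# Illustrative only: ordinary plotting code inside the sandbox; no backend-specific
# calls are needed because MPLBACKEND is set by the executor.
import matplotlib.pyplot as plt

plt.plot([0, 1, 2, 3], [0, 1, 4, 9])
plt.title("sandboxed plot")
plt.show()  # encoded as base64 PNG and sent to the parent over the request pipe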

View file

@ -0,0 +1,21 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import os
DIR = os.path.dirname(os.path.realpath(__file__))
CODE_ENV_PREFIX_FILE = os.path.join(DIR, "code_env_prefix.py")
CODE_ENV_PREFIX = None
def get_code_env_prefix() -> str:
global CODE_ENV_PREFIX
if CODE_ENV_PREFIX is None:
with open(CODE_ENV_PREFIX_FILE, "r") as f:
CODE_ENV_PREFIX = f.read()
return CODE_ENV_PREFIX

View file

@ -0,0 +1,43 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from typing import List
from llama_stack.apis.inference import Message
from llama_stack.apis.safety import * # noqa: F403
from llama_stack.providers.inline.meta_reference.agents.safety import ShieldRunnerMixin
from .builtin import BaseTool
class SafeTool(BaseTool, ShieldRunnerMixin):
"""A tool that makes other tools safety enabled"""
def __init__(
self,
tool: BaseTool,
safety_api: Safety,
input_shields: List[str] = None,
output_shields: List[str] = None,
):
self._tool = tool
ShieldRunnerMixin.__init__(
self, safety_api, input_shields=input_shields, output_shields=output_shields
)
def get_name(self) -> str:
return self._tool.get_name()
async def run(self, messages: List[Message]) -> List[Message]:
if self.input_shields:
await self.run_multiple_shields(messages, self.input_shields)
# run the underlying tool
res = await self._tool.run(messages)
if self.output_shields:
await self.run_multiple_shields(messages, self.output_shields)
return res
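A hedged sketch of wrapping a built-in tool with `SafeTool`; the import paths and shield identifiers are assumptions for illustration:
# Minimal sketch: make a search tool run input/output shields around each call.
from llama_stack.providers.inline.meta_reference.agents.tools.builtin import SearchTool  # assumed path
from llama_stack.providers.inline.meta_reference.agents.tools.safety import SafeTool     # assumed path

def make_safe_search_tool(safety_api, engine: str, api_key: str) -> SafeTool:
    tool = SearchTool(engine, api_key)
    return SafeTool(
        tool,
        safety_api,
        input_shields=["llama_guard"],   # hypothetical shield identifiers
        output_shields=["llama_guard"],
    )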

View file

@ -0,0 +1,15 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from .config import CodeShieldConfig
async def get_provider_impl(config: CodeShieldConfig, deps):
from .code_scanner import MetaReferenceCodeScannerSafetyImpl
impl = MetaReferenceCodeScannerSafetyImpl(config, deps)
await impl.initialize()
return impl

View file

@ -0,0 +1,58 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from typing import Any, Dict, List
from llama_models.llama3.api.datatypes import interleaved_text_media_as_str, Message
from termcolor import cprint
from .config import CodeScannerConfig
from llama_stack.apis.safety import * # noqa: F403
class MetaReferenceCodeScannerSafetyImpl(Safety):
def __init__(self, config: CodeScannerConfig, deps) -> None:
self.config = config
async def initialize(self) -> None:
pass
async def shutdown(self) -> None:
pass
async def register_shield(self, shield: ShieldDef) -> None:
if shield.shield_type != ShieldType.code_scanner.value:
raise ValueError(f"Unsupported safety shield type: {shield.shield_type}")
async def run_shield(
self,
shield_type: str,
messages: List[Message],
params: Dict[str, Any] = None,
) -> RunShieldResponse:
shield_def = await self.shield_store.get_shield(shield_type)
if not shield_def:
raise ValueError(f"Unknown shield {shield_type}")
from codeshield.cs import CodeShield
text = "\n".join([interleaved_text_media_as_str(m.content) for m in messages])
cprint(f"Running CodeScannerShield on {text[50:]}", color="magenta")
result = await CodeShield.scan_code(text)
violation = None
if result.is_insecure:
violation = SafetyViolation(
violation_level=(ViolationLevel.ERROR),
user_message="Sorry, I found security concerns in the code.",
metadata={
"violation_type": ",".join(
[issue.pattern_id for issue in result.issues_found]
)
},
)
return RunShieldResponse(violation=violation)
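A hedged sketch of invoking the scanner through the Safety API; the shield identifier, message import, and code snippet are illustrative assumptions:
# Illustrative only: scan a code-bearing message with an initialized impl that has
# a shield registered under the (assumed) identifier "code_scanner".
from llama_stack.apis.inference import UserMessage  # assumed re-export location

async def check_snippet(safety_impl) -> None:
    response = await safety_impl.run_shield(
        shield_type="code_scanner",
        messages=[UserMessage(content="import os; os.system('curl x | sh')")],
    )
    if response.violation:
        print(response.violation.user_message)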

View file

@ -0,0 +1,11 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from pydantic import BaseModel
class CodeShieldConfig(BaseModel):
pass

View file

@ -0,0 +1,18 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from .config import MetaReferenceDatasetIOConfig
async def get_provider_impl(
config: MetaReferenceDatasetIOConfig,
_deps,
):
from .datasetio import MetaReferenceDatasetIOImpl
impl = MetaReferenceDatasetIOImpl(config)
await impl.initialize()
return impl

View file

@ -0,0 +1,9 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from llama_stack.apis.datasetio import * # noqa: F401, F403
class MetaReferenceDatasetIOConfig(BaseModel): ...

View file

@ -0,0 +1,158 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import io
from typing import List, Optional
import pandas
from llama_models.llama3.api.datatypes import * # noqa: F403
from llama_stack.apis.datasetio import * # noqa: F403
import base64
from abc import ABC, abstractmethod
from dataclasses import dataclass
from urllib.parse import unquote
from llama_stack.providers.datatypes import DatasetsProtocolPrivate
from llama_stack.providers.utils.memory.vector_store import parse_data_url
from .config import MetaReferenceDatasetIOConfig
class BaseDataset(ABC):
def __init__(self, *args, **kwargs) -> None:
super().__init__(*args, **kwargs)
@abstractmethod
def __len__(self) -> int:
raise NotImplementedError()
@abstractmethod
def __getitem__(self, idx):
raise NotImplementedError()
@abstractmethod
def load(self):
raise NotImplementedError()
@dataclass
class DatasetInfo:
dataset_def: DatasetDef
dataset_impl: BaseDataset
class PandasDataframeDataset(BaseDataset):
def __init__(self, dataset_def: DatasetDef, *args, **kwargs) -> None:
super().__init__(*args, **kwargs)
self.dataset_def = dataset_def
self.df = None
def __len__(self) -> int:
assert self.df is not None, "Dataset not loaded. Please call .load() first"
return len(self.df)
def __getitem__(self, idx):
assert self.df is not None, "Dataset not loaded. Please call .load() first"
if isinstance(idx, slice):
return self.df.iloc[idx].to_dict(orient="records")
else:
return self.df.iloc[idx].to_dict()
def _validate_dataset_schema(self, df) -> pandas.DataFrame:
# note that we will drop any columns in dataset that are not in the schema
df = df[self.dataset_def.dataset_schema.keys()]
# check all columns in dataset schema are present
assert len(df.columns) == len(self.dataset_def.dataset_schema)
# TODO: type checking against column types in dataset schema
return df
def load(self) -> None:
if self.df is not None:
return
# TODO: more robust support w/ data url
if self.dataset_def.url.uri.endswith(".csv"):
df = pandas.read_csv(self.dataset_def.url.uri)
elif self.dataset_def.url.uri.endswith(".xlsx"):
df = pandas.read_excel(self.dataset_def.url.uri)
elif self.dataset_def.url.uri.startswith("data:"):
parts = parse_data_url(self.dataset_def.url.uri)
data = parts["data"]
if parts["is_base64"]:
data = base64.b64decode(data)
else:
data = unquote(data)
encoding = parts["encoding"] or "utf-8"
data = data.encode(encoding)
mime_type = parts["mimetype"]
mime_category = mime_type.split("/")[0]
data_bytes = io.BytesIO(data)
if mime_category == "text":
df = pandas.read_csv(data_bytes)
else:
df = pandas.read_excel(data_bytes)
else:
raise ValueError(f"Unsupported file type: {self.dataset_def.url}")
self.df = self._validate_dataset_schema(df)
class MetaReferenceDatasetIOImpl(DatasetIO, DatasetsProtocolPrivate):
def __init__(self, config: MetaReferenceDatasetIOConfig) -> None:
self.config = config
# local registry for keeping track of datasets within the provider
self.dataset_infos = {}
async def initialize(self) -> None: ...
async def shutdown(self) -> None: ...
async def register_dataset(
self,
dataset_def: DatasetDef,
) -> None:
dataset_impl = PandasDataframeDataset(dataset_def)
self.dataset_infos[dataset_def.identifier] = DatasetInfo(
dataset_def=dataset_def,
dataset_impl=dataset_impl,
)
async def list_datasets(self) -> List[DatasetDef]:
return [i.dataset_def for i in self.dataset_infos.values()]
async def get_rows_paginated(
self,
dataset_id: str,
rows_in_page: int,
page_token: Optional[str] = None,
filter_condition: Optional[str] = None,
) -> PaginatedRowsResult:
dataset_info = self.dataset_infos.get(dataset_id)
dataset_info.dataset_impl.load()
if page_token and not page_token.isnumeric():
raise ValueError("Invalid page_token")
if page_token is None or len(page_token) == 0:
next_page_token = 0
else:
next_page_token = int(page_token)
start = next_page_token
if rows_in_page == -1:
end = len(dataset_info.dataset_impl)
else:
end = min(start + rows_in_page, len(dataset_info.dataset_impl))
rows = dataset_info.dataset_impl[start:end]
return PaginatedRowsResult(
rows=rows,
total_count=len(rows),
next_page_token=str(end),
)
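A hedged sketch of registering a CSV-backed dataset and paging through it; the `DatasetDef`/`URL` import locations, the URL, and the schema are illustrative assumptions:
# Minimal sketch: register a dataset and read the first page of rows.
from llama_stack.apis.datasets import DatasetDef  # assumed location
from llama_stack.apis.common.type_system import StringType
from llama_models.llama3.api.datatypes import URL  # assumed location

async def demo(datasetio_impl) -> None:
    dataset_def = DatasetDef(
        identifier="qa_pairs",
        url=URL(uri="https://example.com/qa_pairs.csv"),  # illustrative URL
        dataset_schema={
            "input_query": StringType(),
            "expected_answer": StringType(),
        },
    )
    await datasetio_impl.register_dataset(dataset_def)
    page = await datasetio_impl.get_rows_paginated("qa_pairs", rows_in_page=32)
    print(page.total_count, page.next_page_token)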

View file

@ -0,0 +1,27 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from typing import Dict
from llama_stack.distribution.datatypes import Api, ProviderSpec
from .config import MetaReferenceEvalConfig
async def get_provider_impl(
config: MetaReferenceEvalConfig,
deps: Dict[Api, ProviderSpec],
):
from .eval import MetaReferenceEvalImpl
impl = MetaReferenceEvalImpl(
config,
deps[Api.datasetio],
deps[Api.datasets],
deps[Api.scoring],
deps[Api.inference],
)
await impl.initialize()
return impl

View file

@ -0,0 +1,9 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from llama_stack.apis.eval import * # noqa: F401, F403
class MetaReferenceEvalConfig(BaseModel): ...

View file

@ -0,0 +1,170 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from enum import Enum
from llama_models.llama3.api.datatypes import * # noqa: F403
from llama_stack.apis.common.type_system import * # noqa: F403
from llama_stack.apis.common.job_types import Job
from llama_stack.apis.datasetio import DatasetIO
from llama_stack.apis.datasets import Datasets
from llama_stack.apis.eval import Eval, EvalCandidate, EvaluateResponse, JobStatus
from llama_stack.apis.inference import Inference
from llama_stack.apis.scoring import Scoring
from .config import MetaReferenceEvalConfig
class ColumnName(Enum):
input_query = "input_query"
expected_answer = "expected_answer"
chat_completion_input = "chat_completion_input"
completion_input = "completion_input"
generated_answer = "generated_answer"
class MetaReferenceEvalImpl(Eval):
def __init__(
self,
config: MetaReferenceEvalConfig,
datasetio_api: DatasetIO,
datasets_api: Datasets,
scoring_api: Scoring,
inference_api: Inference,
) -> None:
self.config = config
self.datasetio_api = datasetio_api
self.datasets_api = datasets_api
self.scoring_api = scoring_api
self.inference_api = inference_api
# TODO: assume sync job, will need jobs API for async scheduling
self.jobs = {}
async def initialize(self) -> None: ...
async def shutdown(self) -> None: ...
async def validate_eval_input_dataset_schema(self, dataset_id: str) -> None:
dataset_def = await self.datasets_api.get_dataset(dataset_identifier=dataset_id)
if not dataset_def.dataset_schema or len(dataset_def.dataset_schema) == 0:
raise ValueError(f"Dataset {dataset_id} does not have a schema defined.")
expected_schemas = [
{
ColumnName.input_query.value: StringType(),
ColumnName.expected_answer.value: StringType(),
ColumnName.chat_completion_input.value: ChatCompletionInputType(),
},
{
ColumnName.input_query.value: StringType(),
ColumnName.expected_answer.value: StringType(),
ColumnName.completion_input.value: CompletionInputType(),
},
]
if dataset_def.dataset_schema not in expected_schemas:
raise ValueError(
f"Dataset {dataset_id} does not have a correct input schema in {expected_schemas}"
)
async def evaluate_batch(
self,
dataset_id: str,
candidate: EvalCandidate,
scoring_functions: List[str],
) -> Job:
await self.validate_eval_input_dataset_schema(dataset_id=dataset_id)
all_rows = await self.datasetio_api.get_rows_paginated(
dataset_id=dataset_id,
rows_in_page=-1,
)
res = await self.evaluate(
input_rows=all_rows.rows,
candidate=candidate,
scoring_functions=scoring_functions,
)
# TODO: currently needs to wait for generation before returning
# need job scheduler queue (ray/celery) w/ jobs api
job_id = str(len(self.jobs))
self.jobs[job_id] = res
return Job(job_id=job_id)
async def evaluate(
self,
input_rows: List[Dict[str, Any]],
candidate: EvalCandidate,
scoring_functions: List[str],
) -> EvaluateResponse:
if candidate.type == "agent":
raise NotImplementedError(
"Evaluation with generation has not been implemented for agents"
)
assert (
candidate.sampling_params.max_tokens is not None
), "SamplingParams.max_tokens must be provided"
generations = []
for x in input_rows:
if ColumnName.completion_input.value in x:
input_content = eval(str(x[ColumnName.completion_input.value]))
response = await self.inference_api.completion(
model=candidate.model,
content=input_content,
sampling_params=candidate.sampling_params,
)
generations.append(
{
ColumnName.generated_answer.value: response.completion_message.content
}
)
elif ColumnName.chat_completion_input.value in x:
input_messages = eval(str(x[ColumnName.chat_completion_input.value]))
input_messages = [UserMessage(**x) for x in input_messages]
messages = []
if candidate.system_message:
messages.append(candidate.system_message)
messages += input_messages
response = await self.inference_api.chat_completion(
model=candidate.model,
messages=messages,
sampling_params=candidate.sampling_params,
)
generations.append(
{
ColumnName.generated_answer.value: response.completion_message.content
}
)
else:
raise ValueError("Invalid input row")
# scoring with generated_answer
score_input_rows = [
input_r | generated_r
for input_r, generated_r in zip(input_rows, generations)
]
score_response = await self.scoring_api.score(
input_rows=score_input_rows, scoring_functions=scoring_functions
)
return EvaluateResponse(generations=generations, scores=score_response.results)
async def job_status(self, job_id: str) -> Optional[JobStatus]:
if job_id in self.jobs:
return JobStatus.completed
return None
async def job_cancel(self, job_id: str) -> None:
raise NotImplementedError("Job cancel is not implemented yet")
async def job_result(self, job_id: str) -> EvaluateResponse:
status = await self.job_status(job_id)
if not status or status != JobStatus.completed:
raise ValueError(f"Job is not completed, Status: {status.value}")
return self.jobs[job_id]
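A hedged sketch of driving a batch evaluation end to end; the candidate type, model id, and scoring-function identifier are illustrative assumptions:
# Minimal sketch: evaluate a registered dataset with a model candidate, then read results.
from llama_stack.apis.eval import ModelCandidate  # assumed candidate type
from llama_stack.apis.inference import SamplingParams

async def run_eval(eval_impl) -> None:
    candidate = ModelCandidate(
        model="Llama3.2-3B-Instruct",                    # illustrative model id
        sampling_params=SamplingParams(max_tokens=256),  # max_tokens is required above
    )
    job = await eval_impl.evaluate_batch(
        dataset_id="qa_pairs",
        candidate=candidate,
        scoring_functions=["meta-reference::equality"],  # hypothetical scoring fn id
    )
    result = await eval_impl.job_result(job.job_id)
    print(result.scores)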

View file

@ -0,0 +1,20 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from typing import Union
from .config import MetaReferenceInferenceConfig, MetaReferenceQuantizedInferenceConfig
async def get_provider_impl(
config: Union[MetaReferenceInferenceConfig, MetaReferenceQuantizedInferenceConfig],
_deps,
):
from .inference import MetaReferenceInferenceImpl
impl = MetaReferenceInferenceImpl(config)
await impl.initialize()
return impl

View file

@ -0,0 +1,54 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from typing import Optional
from llama_models.datatypes import * # noqa: F403
from llama_models.sku_list import resolve_model
from llama_stack.apis.inference import * # noqa: F401, F403
from pydantic import BaseModel, Field, field_validator
from llama_stack.providers.utils.inference import supported_inference_models
class MetaReferenceInferenceConfig(BaseModel):
model: str = Field(
default="Llama3.2-3B-Instruct",
description="Model descriptor from `llama model list`",
)
torch_seed: Optional[int] = None
max_seq_len: int = 4096
max_batch_size: int = 1
# when this is False, we assume that the distributed process group is setup by someone
# outside of this code (e.g., when run inside `torchrun`). that is useful for clients
# (including our testing code) who might be using llama-stack as a library.
create_distributed_process_group: bool = True
# By default, the implementation will look at ~/.llama/checkpoints/<model> but you
# can override by specifying the directory explicitly
checkpoint_dir: Optional[str] = None
@field_validator("model")
@classmethod
def validate_model(cls, model: str) -> str:
permitted_models = supported_inference_models()
if model not in permitted_models:
model_list = "\n\t".join(permitted_models)
raise ValueError(
f"Unknown model: `{model}`. Choose from [\n\t{model_list}\n]"
)
return model
@property
def model_parallel_size(self) -> int:
resolved = resolve_model(self.model)
return resolved.pth_file_count
class MetaReferenceQuantizedInferenceConfig(MetaReferenceInferenceConfig):
quantization: QuantizationConfig
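A hedged sketch of constructing the config programmatically; the import path and values are illustrative, and the validator above restricts `model` to descriptors from `supported_inference_models()`:
# Minimal sketch: build a meta-reference inference config.
from config import MetaReferenceInferenceConfig  # assumed import path

cfg = MetaReferenceInferenceConfig(
    model="Llama3.2-3B-Instruct",
    max_seq_len=4096,
    max_batch_size=1,
    checkpoint_dir=None,  # defaults to looking under ~/.llama/checkpoints/<model>
)
print(cfg.model_parallel_size)  # derived from the resolved model's pth_file_count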

View file

@ -0,0 +1,484 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
# Copyright (c) Meta Platforms, Inc. and affiliates.
# This software may be used and distributed in accordance with the terms of the Llama 3 Community License Agreement.
import json
import math
import os
import sys
import time
from pathlib import Path
from typing import Generator, List, Optional, Tuple, Union
import torch
import torch.nn.functional as F
from fairscale.nn.model_parallel.initialize import (
get_model_parallel_rank,
initialize_model_parallel,
model_parallel_is_initialized,
)
from llama_models.llama3.api.args import ModelArgs
from llama_models.llama3.api.chat_format import ChatFormat, ModelInput
from llama_models.llama3.api.tokenizer import Tokenizer
from llama_models.llama3.reference_impl.model import Transformer
from llama_models.llama3.reference_impl.multimodal.model import (
CrossAttentionTransformer,
)
from llama_models.sku_list import resolve_model
from pydantic import BaseModel
from termcolor import cprint
from llama_stack.apis.inference import * # noqa: F403
from lmformatenforcer import JsonSchemaParser, TokenEnforcer, TokenEnforcerTokenizerData
from llama_stack.distribution.utils.model_utils import model_local_dir
from llama_stack.providers.utils.inference.prompt_adapter import (
augment_content_with_response_format_prompt,
chat_completion_request_to_messages,
)
from .config import (
Fp8QuantizationConfig,
Int4QuantizationConfig,
MetaReferenceInferenceConfig,
MetaReferenceQuantizedInferenceConfig,
)
def model_checkpoint_dir(model) -> str:
checkpoint_dir = Path(model_local_dir(model.descriptor()))
paths = [Path(checkpoint_dir / f"consolidated.{ext}") for ext in ["pth", "00.pth"]]
if not any(p.exists() for p in paths):
checkpoint_dir = checkpoint_dir / "original"
assert checkpoint_dir.exists(), (
f"Could not find checkpoints in: {model_local_dir(model.descriptor())}. "
f"Please download model using `llama download --model-id {model.descriptor()}`"
)
return str(checkpoint_dir)
class TokenResult(BaseModel):
token: int
text: str
logprobs: Optional[List[float]] = None
class Llama:
@staticmethod
def build(
config: Union[
MetaReferenceInferenceConfig, MetaReferenceQuantizedInferenceConfig
],
):
"""
Build a Llama instance by initializing and loading a model checkpoint.
Note:
This method initializes the distributed process group, sets the device to CUDA,
and loads the pre-trained model and tokenizer.
"""
model = resolve_model(config.model)
if not torch.distributed.is_initialized():
torch.distributed.init_process_group("nccl")
model_parallel_size = config.model_parallel_size
if not model_parallel_is_initialized():
initialize_model_parallel(model_parallel_size)
local_rank = int(os.environ.get("LOCAL_RANK", 0))
torch.cuda.set_device(local_rank)
# seed must be the same in all processes
if config.torch_seed is not None:
torch.manual_seed(config.torch_seed)
if local_rank > 0:
sys.stdout = open(os.devnull, "w")
start_time = time.time()
if config.checkpoint_dir:
ckpt_dir = config.checkpoint_dir
else:
ckpt_dir = model_checkpoint_dir(model)
checkpoints = sorted(Path(ckpt_dir).glob("*.pth"))
assert len(checkpoints) > 0, f"no checkpoint files found in {ckpt_dir}"
assert model_parallel_size == len(
checkpoints
), f"Loading a checkpoint for MP={len(checkpoints)} but world size is {model_parallel_size}"
ckpt_path = checkpoints[get_model_parallel_rank()]
state_dict = torch.load(ckpt_path, map_location="cpu", weights_only=True)
with open(Path(ckpt_dir) / "params.json", "r") as f:
params = json.loads(f.read())
if "model" in params:
params = params["model"]
model_args: ModelArgs = ModelArgs(
max_seq_len=config.max_seq_len,
max_batch_size=config.max_batch_size,
**params,
)
tokenizer = Tokenizer.get_instance()
assert (
model_args.vocab_size == tokenizer.n_words
), f"model_args vocab = {model_args.vocab_size} but tokenizer vocab = {tokenizer.n_words}"
if isinstance(config, MetaReferenceQuantizedInferenceConfig):
if isinstance(config.quantization, Fp8QuantizationConfig):
from .quantization.loader import convert_to_fp8_quantized_model
# load on CPU in bf16 so that fp8 conversion does not find an
# unexpected (fp32, e.g.) datatype
torch.set_default_tensor_type(torch.BFloat16Tensor)
if model_args.vision_chunk_size > 0:
model = CrossAttentionTransformer(model_args)
model.setup_cache(model_args.max_batch_size, torch.bfloat16)
else:
model = Transformer(model_args)
model.load_state_dict(state_dict, strict=False)
model = convert_to_fp8_quantized_model(model, config, ckpt_dir)
elif isinstance(config.quantization, Int4QuantizationConfig):
from .quantization.loader import convert_to_int4_quantized_model
model = Transformer(model_args)
model = convert_to_int4_quantized_model(model, model_args, config)
model.load_state_dict(state_dict, strict=True)
if (
model_args.quantization_args is not None
and model_args.quantization_args.spinquant
):
# Add a wrapper for adding hadamard transform for spinquant.
# This needs to be done after loading the state dict otherwise an error will be raised while
# loading the state dict.
from .quantization.hadamard_utils import (
add_hadamard_transform_for_spinquant,
)
add_hadamard_transform_for_spinquant(model)
else:
raise NotImplementedError(
"Currently int4 and fp8 are the only supported quantization methods."
)
else:
if torch.cuda.is_bf16_supported():
torch.set_default_tensor_type(torch.cuda.BFloat16Tensor)
else:
torch.set_default_tensor_type(torch.cuda.HalfTensor)
if model_args.vision_chunk_size > 0:
model = CrossAttentionTransformer(model_args)
model.setup_cache(model_args.max_batch_size, torch.bfloat16)
else:
model = Transformer(model_args)
model.load_state_dict(state_dict, strict=False)
print(f"Loaded in {time.time() - start_time:.2f} seconds")
return Llama(model, tokenizer, model_args)
def __init__(self, model: Transformer, tokenizer: Tokenizer, args: ModelArgs):
self.args = args
self.model = model
self.tokenizer = tokenizer
self.formatter = ChatFormat(tokenizer)
@torch.inference_mode()
def generate(
self,
model_input: ModelInput,
max_gen_len: int,
temperature: float = 0.6,
top_p: float = 0.9,
logprobs: bool = False,
echo: bool = False,
include_stop_token: bool = False,
print_input_tokens: bool = False,
logits_processor: Optional["LogitsProcessor"] = None,
) -> Generator:
params = self.model.params
if print_input_tokens:
input_tokens = [
self.formatter.vision_token if t == 128256 else t
for t in model_input.tokens
]
cprint("Input to model -> " + self.tokenizer.decode(input_tokens), "red")
prompt_tokens = [model_input.tokens]
bsz = 1
assert bsz <= params.max_batch_size, (bsz, params.max_batch_size)
min_prompt_len = min(len(t) for t in prompt_tokens)
max_prompt_len = max(len(t) for t in prompt_tokens)
if max_prompt_len >= params.max_seq_len:
cprint(
f"Out of token budget {max_prompt_len} vs {params.max_seq_len}", "red"
)
return
total_len = min(max_gen_len + max_prompt_len, params.max_seq_len)
is_vision = isinstance(self.model, CrossAttentionTransformer)
if is_vision:
images = model_input.vision.images if model_input.vision is not None else []
mask = model_input.vision.mask if model_input.vision is not None else []
# the method works for bsz > 1 so add a batch dimension
xattn_caches, cross_attention_masks, full_text_row_masked_out_mask = (
self.model.compute_vision_tokens_masks(
batch_images=[images],
batch_masks=[mask],
total_len=total_len,
)
)
pad_id = self.tokenizer.pad_id
tokens = torch.full((bsz, total_len), pad_id, dtype=torch.long, device="cuda")
for k, t in enumerate(prompt_tokens):
tokens[k, : len(t)] = torch.tensor(t, dtype=torch.long, device="cuda")
if logprobs:
token_logprobs = torch.zeros_like(tokens, dtype=torch.float)
prev_pos = 0
eos_reached = torch.tensor([False] * bsz, device="cuda")
input_text_mask = tokens != pad_id
if min_prompt_len == total_len:
# TODO(ashwin): unify this branch with the one below and figure out multimodal crap
logits = self.model.forward(tokens, prev_pos)
token_logprobs = -F.cross_entropy(
input=logits.transpose(1, 2),
target=tokens,
reduction="none",
ignore_index=pad_id,
)
stop_tokens = torch.tensor(self.tokenizer.stop_tokens, device="cuda")
for cur_pos in range(min_prompt_len, total_len):
if is_vision:
position_ids = torch.arange(
prev_pos, cur_pos, dtype=torch.long, device="cuda"
)
logits = self.model.forward(
position_ids,
tokens,
cross_attention_masks,
full_text_row_masked_out_mask,
xattn_caches,
)
else:
logits = self.model.forward(tokens[:, prev_pos:cur_pos], prev_pos)
if logits_processor is not None:
logits = logits_processor.process_logits(tokens[:, :cur_pos], logits)
if temperature > 0:
probs = torch.softmax(logits[:, -1] / temperature, dim=-1)
next_token = sample_top_p(probs, top_p)
else:
next_token = torch.argmax(logits[:, -1], dim=-1)
next_token = next_token.reshape(-1)
# only replace token if prompt has already been generated
next_token = torch.where(
input_text_mask[:, cur_pos], tokens[:, cur_pos], next_token
)
tokens[:, cur_pos] = next_token
target = tokens[:, prev_pos + 1 : cur_pos + 1]
if is_vision:
# the logits space (num_classes) is designed to never contain a media_token
# however our input token stream does contain them. we need to nuke them here
# or else the CUDA kernels will crash with an illegal memory access
vision_tokens = [self.tokenizer.special_tokens["<|image|>"], 128256]
masks = [target.eq(t) for t in vision_tokens]
if len(masks) > 1:
mask = torch.logical_or(*masks)
else:
mask = masks[0]
target[mask] = 0
if logprobs:
token_logprobs[:, prev_pos + 1 : cur_pos + 1] = -F.cross_entropy(
input=logits.transpose(1, 2),
target=tokens[:, prev_pos + 1 : cur_pos + 1],
reduction="none",
ignore_index=pad_id,
)
eos_reached |= (~input_text_mask[:, cur_pos]) & (
torch.isin(next_token, stop_tokens)
)
yield TokenResult(
token=next_token[0].item(),
text=self.tokenizer.decode(next_token.tolist()),
logprobs=(
token_logprobs[:, cur_pos : cur_pos + 1][0].tolist()
if logprobs
else None
),
)
prev_pos = cur_pos
if all(eos_reached):
break
def completion(
self,
request: CompletionRequest,
) -> Generator:
sampling_params = request.sampling_params
max_gen_len = sampling_params.max_tokens
if (
max_gen_len is None
or max_gen_len == 0
or max_gen_len >= self.model.params.max_seq_len
):
max_gen_len = self.model.params.max_seq_len - 1
content = augment_content_with_response_format_prompt(
request.response_format, request.content
)
model_input = self.formatter.encode_content(content)
yield from self.generate(
model_input=model_input,
max_gen_len=max_gen_len,
temperature=sampling_params.temperature,
top_p=sampling_params.top_p,
logprobs=bool(request.logprobs),
include_stop_token=True,
logits_processor=get_logits_processor(
self.tokenizer,
self.args.vocab_size,
request.response_format,
),
)
def chat_completion(
self,
request: ChatCompletionRequest,
) -> Generator:
messages = chat_completion_request_to_messages(request)
sampling_params = request.sampling_params
max_gen_len = sampling_params.max_tokens
if (
max_gen_len is None
or max_gen_len == 0
or max_gen_len >= self.model.params.max_seq_len
):
max_gen_len = self.model.params.max_seq_len - 1
yield from self.generate(
model_input=self.formatter.encode_dialog_prompt(
messages,
request.tool_prompt_format,
),
max_gen_len=max_gen_len,
temperature=sampling_params.temperature,
top_p=sampling_params.top_p,
logprobs=bool(request.logprobs),
include_stop_token=True,
logits_processor=get_logits_processor(
self.tokenizer,
self.args.vocab_size,
request.response_format,
),
)
def sample_top_p(probs, p):
"""
Perform top-p (nucleus) sampling on a probability distribution.
Args:
probs (torch.Tensor): Probability distribution tensor.
p (float): Probability threshold for top-p sampling.
Returns:
torch.Tensor: Sampled token indices.
Note:
Top-p sampling selects the smallest set of tokens whose cumulative probability mass
exceeds the threshold p. The distribution is renormalized based on the selected tokens.
"""
probs_sort, probs_idx = torch.sort(probs, dim=-1, descending=True)
probs_sum = torch.cumsum(probs_sort, dim=-1)
mask = probs_sum - probs_sort > p
probs_sort[mask] = 0.0
probs_sort.div_(probs_sort.sum(dim=-1, keepdim=True))
next_token = torch.multinomial(probs_sort, num_samples=1)
next_token = torch.gather(probs_idx, -1, next_token)
return next_token
class LogitsProcessor:
def __init__(self, token_enforcer: TokenEnforcer):
self.token_enforcer = token_enforcer
self.mask: Optional[torch.Tensor] = None
def process_logits(
self, tokens: torch.Tensor, scores: torch.Tensor
) -> torch.Tensor:
token_sequence = tokens[0, :].tolist()
allowed_tokens = self.token_enforcer.get_allowed_tokens(token_sequence)
if self.mask is not None:
self.mask.fill_(-math.inf)
else:
self.mask = torch.full_like(scores, -math.inf)
self.mask[:, :, allowed_tokens] = 0
scores = scores + self.mask
return scores
def get_logits_processor(
tokenizer: Tokenizer,
vocab_size: int,
response_format: Optional[ResponseFormat],
) -> Optional["LogitsProcessor"]:
if response_format is None:
return None
if response_format.type != ResponseFormatType.json_schema.value:
raise ValueError(f"Unsupported response format type {response_format.type}")
parser = JsonSchemaParser(response_format.json_schema)
data = TokenEnforcerTokenizerData(
_build_regular_tokens_list(tokenizer, vocab_size),
tokenizer.decode,
tokenizer.stop_tokens,
)
token_enforcer = TokenEnforcer(data, parser)
return LogitsProcessor(token_enforcer)
def _build_regular_tokens_list(
tokenizer: Tokenizer, vocab_size: int
) -> List[Tuple[int, str, bool]]:
token_0 = tokenizer.encode("0", bos=False, eos=False)[-1]
regular_tokens = []
special_token_ids = set(tokenizer.special_tokens.values())
for token_idx in range(vocab_size):
if token_idx in special_token_ids:
continue
# We prepend token 0 and skip the first letter of the result to get a space if the token is a start word.
decoded_after_0 = tokenizer.decode([token_0, token_idx])[1:]
decoded_regular = tokenizer.decode([token_idx])
is_word_start_token = len(decoded_after_0) > len(decoded_regular)
regular_tokens.append((token_idx, decoded_after_0, is_word_start_token))
return regular_tokens
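To make the nucleus-sampling step concrete, a small standalone illustration of the `sample_top_p` logic above on a toy distribution (the helper is re-stated verbatim so the snippet runs on its own):
# Standalone illustration of the top-p sampling helper defined above.
import torch

def sample_top_p(probs, p):
    probs_sort, probs_idx = torch.sort(probs, dim=-1, descending=True)
    probs_sum = torch.cumsum(probs_sort, dim=-1)
    mask = probs_sum - probs_sort > p
    probs_sort[mask] = 0.0
    probs_sort.div_(probs_sort.sum(dim=-1, keepdim=True))
    next_token = torch.multinomial(probs_sort, num_samples=1)
    return torch.gather(probs_idx, -1, next_token)

probs = torch.tensor([[0.5, 0.3, 0.15, 0.05]])
# With p=0.7 only the two most likely tokens (indices 0 and 1) can be drawn.
print(sample_top_p(probs, p=0.7))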

View file

@ -0,0 +1,425 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import asyncio
from typing import AsyncGenerator, List
from llama_models.sku_list import resolve_model
from llama_models.llama3.api.datatypes import * # noqa: F403
from llama_stack.apis.inference import * # noqa: F403
from llama_stack.providers.datatypes import ModelDef, ModelsProtocolPrivate
from llama_stack.providers.utils.inference.prompt_adapter import (
convert_image_media_to_url,
request_has_media,
)
from .config import MetaReferenceInferenceConfig
from .generation import Llama
from .model_parallel import LlamaModelParallelGenerator
# there's a single model parallel process running that serves the model. for now,
# we don't support multiple concurrent requests to this process.
SEMAPHORE = asyncio.Semaphore(1)
class MetaReferenceInferenceImpl(Inference, ModelsProtocolPrivate):
def __init__(self, config: MetaReferenceInferenceConfig) -> None:
self.config = config
model = resolve_model(config.model)
if model is None:
raise RuntimeError(f"Unknown model: {config.model}, Run `llama model list`")
self.model = model
# verify that the checkpoint actually is for this model lol
async def initialize(self) -> None:
print(f"Loading model `{self.model.descriptor()}`")
if self.config.create_distributed_process_group:
self.generator = LlamaModelParallelGenerator(self.config)
self.generator.start()
else:
self.generator = Llama.build(self.config)
async def register_model(self, model: ModelDef) -> None:
raise ValueError("Dynamic model registration is not supported")
async def list_models(self) -> List[ModelDef]:
return [
ModelDef(
identifier=self.model.descriptor(),
llama_model=self.model.descriptor(),
)
]
async def shutdown(self) -> None:
if self.config.create_distributed_process_group:
self.generator.stop()
def check_model(self, request) -> None:
model = resolve_model(request.model)
if model is None:
raise RuntimeError(
f"Unknown model: {request.model}, Run `llama model list`"
)
elif model.descriptor() != self.model.descriptor():
raise RuntimeError(
f"Model mismatch: {request.model} != {self.model.descriptor()}"
)
async def completion(
self,
model: str,
content: InterleavedTextMedia,
sampling_params: Optional[SamplingParams] = SamplingParams(),
response_format: Optional[ResponseFormat] = None,
stream: Optional[bool] = False,
logprobs: Optional[LogProbConfig] = None,
) -> Union[CompletionResponse, CompletionResponseStreamChunk]:
if logprobs:
assert logprobs.top_k == 1, f"Unexpected top_k={logprobs.top_k}"
request = CompletionRequest(
model=model,
content=content,
sampling_params=sampling_params,
response_format=response_format,
stream=stream,
logprobs=logprobs,
)
self.check_model(request)
request = await request_with_localized_media(request)
if request.stream:
return self._stream_completion(request)
else:
return await self._nonstream_completion(request)
async def _stream_completion(self, request: CompletionRequest) -> AsyncGenerator:
def impl():
stop_reason = None
for token_result in self.generator.completion(request):
if token_result.text == "<|eot_id|>":
stop_reason = StopReason.end_of_turn
text = ""
elif token_result.text == "<|eom_id|>":
stop_reason = StopReason.end_of_message
text = ""
else:
text = token_result.text
logprobs = None
if stop_reason is None:
if request.logprobs:
assert len(token_result.logprobs) == 1
logprobs = [
TokenLogProbs(
logprobs_by_token={
token_result.text: token_result.logprobs[0]
}
)
]
yield CompletionResponseStreamChunk(
delta=text,
stop_reason=stop_reason,
logprobs=logprobs if request.logprobs else None,
)
if stop_reason is None:
yield CompletionResponseStreamChunk(
delta="",
stop_reason=StopReason.out_of_tokens,
)
if self.config.create_distributed_process_group:
async with SEMAPHORE:
for x in impl():
yield x
else:
for x in impl():
yield x
async def _nonstream_completion(
self, request: CompletionRequest
) -> CompletionResponse:
def impl():
tokens = []
logprobs = []
stop_reason = None
tokenizer = self.generator.formatter.tokenizer
for token_result in self.generator.completion(request):
tokens.append(token_result.token)
if token_result.token in tokenizer.stop_tokens:
# not quite right semantically
stop_reason = StopReason.end_of_turn
if request.logprobs:
assert len(token_result.logprobs) == 1
logprobs.append(
TokenLogProbs(
logprobs_by_token={
token_result.text: token_result.logprobs[0]
}
)
)
if stop_reason is None:
stop_reason = StopReason.out_of_tokens
content = self.generator.formatter.tokenizer.decode(tokens)
return CompletionResponse(
content=content,
stop_reason=stop_reason,
logprobs=logprobs if request.logprobs else None,
)
if self.config.create_distributed_process_group:
async with SEMAPHORE:
return impl()
else:
return impl()
async def chat_completion(
self,
model: str,
messages: List[Message],
sampling_params: Optional[SamplingParams] = SamplingParams(),
response_format: Optional[ResponseFormat] = None,
tools: Optional[List[ToolDefinition]] = None,
tool_choice: Optional[ToolChoice] = ToolChoice.auto,
tool_prompt_format: Optional[ToolPromptFormat] = ToolPromptFormat.json,
stream: Optional[bool] = False,
logprobs: Optional[LogProbConfig] = None,
) -> AsyncGenerator:
if logprobs:
assert logprobs.top_k == 1, f"Unexpected top_k={logprobs.top_k}"
# wrapper request to make it easier to pass around (internal only, not exposed to API)
request = ChatCompletionRequest(
model=model,
messages=messages,
sampling_params=sampling_params,
tools=tools or [],
tool_choice=tool_choice,
tool_prompt_format=tool_prompt_format,
response_format=response_format,
stream=stream,
logprobs=logprobs,
)
self.check_model(request)
request = await request_with_localized_media(request)
if self.config.create_distributed_process_group:
if SEMAPHORE.locked():
raise RuntimeError("Only one concurrent request is supported")
if request.stream:
return self._stream_chat_completion(request)
else:
return await self._nonstream_chat_completion(request)
async def _nonstream_chat_completion(
self, request: ChatCompletionRequest
) -> ChatCompletionResponse:
def impl():
tokens = []
logprobs = []
stop_reason = None
for token_result in self.generator.chat_completion(request):
tokens.append(token_result.token)
if token_result.text == "<|eot_id|>":
stop_reason = StopReason.end_of_turn
elif token_result.text == "<|eom_id|>":
stop_reason = StopReason.end_of_message
if request.logprobs:
assert len(token_result.logprobs) == 1
logprobs.append(
TokenLogProbs(
logprobs_by_token={
token_result.text: token_result.logprobs[0]
}
)
)
if stop_reason is None:
stop_reason = StopReason.out_of_tokens
message = self.generator.formatter.decode_assistant_message(
tokens, stop_reason
)
return ChatCompletionResponse(
completion_message=message,
logprobs=logprobs if request.logprobs else None,
)
if self.config.create_distributed_process_group:
async with SEMAPHORE:
return impl()
else:
return impl()
async def _stream_chat_completion(
self, request: ChatCompletionRequest
) -> AsyncGenerator:
def impl():
yield ChatCompletionResponseStreamChunk(
event=ChatCompletionResponseEvent(
event_type=ChatCompletionResponseEventType.start,
delta="",
)
)
tokens = []
logprobs = []
stop_reason = None
ipython = False
for token_result in self.generator.chat_completion(request):
tokens.append(token_result.token)
if not ipython and token_result.text.startswith("<|python_tag|>"):
ipython = True
yield ChatCompletionResponseStreamChunk(
event=ChatCompletionResponseEvent(
event_type=ChatCompletionResponseEventType.progress,
delta=ToolCallDelta(
content="",
parse_status=ToolCallParseStatus.started,
),
)
)
continue
if token_result.text == "<|eot_id|>":
stop_reason = StopReason.end_of_turn
text = ""
elif token_result.text == "<|eom_id|>":
stop_reason = StopReason.end_of_message
text = ""
else:
text = token_result.text
if ipython:
delta = ToolCallDelta(
content=text,
parse_status=ToolCallParseStatus.in_progress,
)
else:
delta = text
if stop_reason is None:
if request.logprobs:
assert len(token_result.logprobs) == 1
logprobs.append(
TokenLogProbs(
logprobs_by_token={
token_result.text: token_result.logprobs[0]
}
)
)
yield ChatCompletionResponseStreamChunk(
event=ChatCompletionResponseEvent(
event_type=ChatCompletionResponseEventType.progress,
delta=delta,
stop_reason=stop_reason,
logprobs=logprobs if request.logprobs else None,
)
)
if stop_reason is None:
stop_reason = StopReason.out_of_tokens
message = self.generator.formatter.decode_assistant_message(
tokens, stop_reason
)
parsed_tool_calls = len(message.tool_calls) > 0
if ipython and not parsed_tool_calls:
yield ChatCompletionResponseStreamChunk(
event=ChatCompletionResponseEvent(
event_type=ChatCompletionResponseEventType.progress,
delta=ToolCallDelta(
content="",
parse_status=ToolCallParseStatus.failure,
),
stop_reason=stop_reason,
)
)
for tool_call in message.tool_calls:
yield ChatCompletionResponseStreamChunk(
event=ChatCompletionResponseEvent(
event_type=ChatCompletionResponseEventType.progress,
delta=ToolCallDelta(
content=tool_call,
parse_status=ToolCallParseStatus.success,
),
stop_reason=stop_reason,
)
)
yield ChatCompletionResponseStreamChunk(
event=ChatCompletionResponseEvent(
event_type=ChatCompletionResponseEventType.complete,
delta="",
stop_reason=stop_reason,
)
)
if self.config.create_distributed_process_group:
async with SEMAPHORE:
for x in impl():
yield x
else:
for x in impl():
yield x
async def embeddings(
self,
model: str,
contents: List[InterleavedTextMedia],
) -> EmbeddingsResponse:
raise NotImplementedError()
async def request_with_localized_media(
request: Union[ChatCompletionRequest, CompletionRequest],
) -> Union[ChatCompletionRequest, CompletionRequest]:
if not request_has_media(request):
return request
async def _convert_single_content(content):
if isinstance(content, ImageMedia):
url = await convert_image_media_to_url(content, download=True)
return ImageMedia(image=URL(uri=url))
else:
return content
async def _convert_content(content):
if isinstance(content, list):
return [await _convert_single_content(c) for c in content]
else:
return await _convert_single_content(content)
if isinstance(request, ChatCompletionRequest):
for m in request.messages:
m.content = await _convert_content(m.content)
else:
request.content = await _convert_content(request.content)
return request
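A hedged sketch of exercising the implementation directly; the model descriptor and message are illustrative, and because of the semaphore above only one request can be in flight at a time:
# Minimal sketch: non-streaming chat completion against the meta-reference impl.
from llama_stack.apis.inference import SamplingParams, UserMessage  # assumed re-exports

async def ask(impl) -> None:
    response = await impl.chat_completion(
        model="Llama3.2-3B-Instruct",  # must match the configured model
        messages=[UserMessage(content="Write a haiku about GPUs.")],
        sampling_params=SamplingParams(max_tokens=128),
        stream=False,
    )
    print(response.completion_message.content)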

View file

@ -0,0 +1,93 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import os
from copy import deepcopy
from functools import partial
from typing import Any, Generator
from llama_models.llama3.api.chat_format import ChatFormat
from llama_models.llama3.api.tokenizer import Tokenizer
from llama_models.sku_list import resolve_model
from llama_stack.apis.inference import ChatCompletionRequest, CompletionRequest
from .config import MetaReferenceInferenceConfig
from .generation import Llama, model_checkpoint_dir
from .parallel_utils import ModelParallelProcessGroup
class ModelRunner:
def __init__(self, llama):
self.llama = llama
# the `task` object is the same that is sent to `ModelParallelProcessGroup.run_inference()`
def __call__(self, req: Any):
if isinstance(req, ChatCompletionRequest):
return self.llama.chat_completion(req)
elif isinstance(req, CompletionRequest):
return self.llama.completion(req)
else:
raise ValueError(f"Unexpected task type {type(req)}")
def init_model_cb(config: MetaReferenceInferenceConfig):
llama = Llama.build(config)
return ModelRunner(llama)
class LlamaModelParallelGenerator:
"""
This abstraction exists so
- we can run model parallel code without needing to run the CLIs via torchrun
- this also enables using model parallel code within a notebook context.
A Context Manager is used to ensure that the model parallel process is started and stopped
correctly. This does make the ergonomics a little awkward, because it isn't immediately
clear at the callsite why we need to use a context manager.
"""
def __init__(self, config: MetaReferenceInferenceConfig):
self.config = config
self.model = resolve_model(self.config.model)
# this is a hack because Agent's loop uses this to tokenize and check if input is too long
# while the tool-use loop is going
checkpoint_dir = model_checkpoint_dir(self.model)
tokenizer_path = os.path.join(checkpoint_dir, "tokenizer.model")
self.formatter = ChatFormat(Tokenizer(tokenizer_path))
def start(self):
self.__enter__()
def stop(self):
self.__exit__(None, None, None)
def __enter__(self):
self.group = ModelParallelProcessGroup(
self.config.model_parallel_size,
init_model_cb=partial(init_model_cb, self.config),
)
self.group.start()
return self
def __exit__(self, exc_type, exc_value, exc_traceback):
self.group.stop()
def completion(
self,
request: CompletionRequest,
) -> Generator:
req_obj = deepcopy(request)
gen = self.group.run_inference(req_obj)
yield from gen
def chat_completion(
self,
request: ChatCompletionRequest,
) -> Generator:
req_obj = deepcopy(request)
gen = self.group.run_inference(req_obj)
yield from gen
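A hedged sketch of the context-manager usage the docstring above alludes to; import paths and config values are illustrative assumptions:
# Minimal sketch: run the model-parallel generator without torchrun.
from config import MetaReferenceInferenceConfig         # assumed import path
from model_parallel import LlamaModelParallelGenerator  # assumed import path
from llama_stack.apis.inference import CompletionRequest

config = MetaReferenceInferenceConfig(model="Llama3.2-3B-Instruct")
with LlamaModelParallelGenerator(config) as generator:
    request = CompletionRequest(model=config.model, content="Once upon a time")
    for token in generator.completion(request):
        print(token.text, end="", flush=True)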

View file

@ -0,0 +1,378 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import json
import multiprocessing
import os
import tempfile
import time
import uuid
from enum import Enum
from typing import Callable, Generator, Literal, Optional, Union
import torch
import zmq
from fairscale.nn.model_parallel.initialize import (
get_model_parallel_group,
get_model_parallel_rank,
get_model_parallel_src_rank,
)
from pydantic import BaseModel, Field
from torch.distributed.launcher.api import elastic_launch, LaunchConfig
from typing_extensions import Annotated
from llama_stack.apis.inference import ChatCompletionRequest, CompletionRequest
from .generation import TokenResult
class ProcessingMessageName(str, Enum):
ready_request = "ready_request"
ready_response = "ready_response"
end_sentinel = "end_sentinel"
cancel_sentinel = "cancel_sentinel"
task_request = "task_request"
task_response = "task_response"
exception_response = "exception_response"
class ReadyRequest(BaseModel):
type: Literal[ProcessingMessageName.ready_request] = (
ProcessingMessageName.ready_request
)
class ReadyResponse(BaseModel):
type: Literal[ProcessingMessageName.ready_response] = (
ProcessingMessageName.ready_response
)
class EndSentinel(BaseModel):
type: Literal[ProcessingMessageName.end_sentinel] = (
ProcessingMessageName.end_sentinel
)
class CancelSentinel(BaseModel):
type: Literal[ProcessingMessageName.cancel_sentinel] = (
ProcessingMessageName.cancel_sentinel
)
class TaskRequest(BaseModel):
type: Literal[ProcessingMessageName.task_request] = (
ProcessingMessageName.task_request
)
task: Union[CompletionRequest, ChatCompletionRequest]
class TaskResponse(BaseModel):
type: Literal[ProcessingMessageName.task_response] = (
ProcessingMessageName.task_response
)
result: TokenResult
class ExceptionResponse(BaseModel):
type: Literal[ProcessingMessageName.exception_response] = (
ProcessingMessageName.exception_response
)
error: str
ProcessingMessage = Union[
ReadyRequest,
ReadyResponse,
EndSentinel,
CancelSentinel,
TaskRequest,
TaskResponse,
ExceptionResponse,
]
class ProcessingMessageWrapper(BaseModel):
payload: Annotated[
ProcessingMessage,
Field(discriminator="type"),
]
def mp_rank_0() -> bool:
return get_model_parallel_rank() == 0
def encode_msg(msg: ProcessingMessage) -> bytes:
return ProcessingMessageWrapper(payload=msg).model_dump_json().encode("utf-8")
def retrieve_requests(reply_socket_url: str):
if mp_rank_0():
context = zmq.Context()
reply_socket = context.socket(zmq.ROUTER)
reply_socket.connect(reply_socket_url)
while True:
client_id, obj = maybe_get_work(reply_socket)
if obj is None:
time.sleep(0.01)
continue
ready_response = ReadyResponse()
reply_socket.send_multipart([client_id, encode_msg(ready_response)])
break
def send_obj(obj: ProcessingMessage):
reply_socket.send_multipart([client_id, encode_msg(obj)])
while True:
tasks = [None]
if mp_rank_0():
client_id, maybe_task_json = maybe_get_work(reply_socket)
if maybe_task_json is not None:
task = maybe_parse_message(maybe_task_json)
# there is still an unknown unclean GeneratorExit happening resulting in a
# cancel sentinel getting queued _after_ we have finished sending everything :/
# this check is a bit of a hack to work around that
if task is not None and not isinstance(task, CancelSentinel):
tasks = [task]
torch.distributed.broadcast_object_list(
tasks,
src=get_model_parallel_src_rank(),
group=get_model_parallel_group(),
)
task = tasks[0]
if task is None:
time.sleep(0.1)
else:
try:
out = yield task
if out is None:
break
for obj in out:
updates = [None]
if mp_rank_0():
_, update_json = maybe_get_work(reply_socket)
update = maybe_parse_message(update_json)
if isinstance(update, CancelSentinel):
updates = [update]
else:
# only send the update if it's not cancelled otherwise the object sits in the socket
# and gets pulled in the next request lol
send_obj(TaskResponse(result=obj))
torch.distributed.broadcast_object_list(
updates,
src=get_model_parallel_src_rank(),
group=get_model_parallel_group(),
)
if isinstance(updates[0], CancelSentinel):
print("quitting generation loop because request was cancelled")
break
if mp_rank_0():
send_obj(EndSentinel())
except Exception as e:
print(f"[debug] got exception {e}")
import traceback
traceback.print_exc()
if mp_rank_0():
send_obj(ExceptionResponse(error=str(e)))
if mp_rank_0():
send_obj(EndSentinel())
def maybe_get_work(sock: zmq.Socket):
message = None
client_id = None
try:
client_id, obj = sock.recv_multipart(zmq.NOBLOCK)
message = obj.decode("utf-8")
except zmq.ZMQError as e:
if e.errno != zmq.EAGAIN:
raise e
return client_id, message
def maybe_parse_message(maybe_json: Optional[str]) -> Optional[ProcessingMessage]:
if maybe_json is None:
return None
try:
return parse_message(maybe_json)
except json.JSONDecodeError:
return None
except ValueError as e:
return None
def parse_message(json_str: str) -> ProcessingMessage:
data = json.loads(json_str)
return ProcessingMessageWrapper(**data).payload
def worker_process_entrypoint(
reply_socket_url: str,
init_model_cb: Callable,
) -> None:
model = init_model_cb()
torch.distributed.barrier()
time.sleep(1)
# run the requests co-routine which retrieves requests from the socket
# and sends responses (we provide) back to the caller
req_gen = retrieve_requests(reply_socket_url)
result = None
while True:
try:
task = req_gen.send(result)
            if isinstance(task, EndSentinel):
break
assert isinstance(task, TaskRequest)
result = model(task.task)
except StopIteration:
break
print("[debug] worker process done")
def launch_dist_group(
reply_socket_url: str,
model_parallel_size: int,
init_model_cb: Callable,
**kwargs,
) -> None:
id = uuid.uuid4().hex
dist_url = f"file:///tmp/llama3_{id}_{time.time()}"
with tempfile.TemporaryDirectory() as tmpdir:
# TODO: track workers and if they terminate, tell parent process about it so cleanup can happen
launch_config = LaunchConfig(
max_nodes=1,
min_nodes=1,
nproc_per_node=model_parallel_size,
start_method="fork",
rdzv_backend="c10d",
rdzv_endpoint=os.path.join(tmpdir, "rdzv"),
rdzv_configs={"store_type": "file", "timeout": 90},
max_restarts=0,
monitor_interval=1,
run_id=str(uuid.uuid4()),
)
elastic_launch(launch_config, entrypoint=worker_process_entrypoint)(
reply_socket_url,
init_model_cb,
)
def start_model_parallel_process(
model_parallel_size: int,
init_model_cb: Callable,
**kwargs,
):
context = zmq.Context()
request_socket = context.socket(zmq.DEALER)
# Binding the request socket to a random port
request_socket.bind("tcp://127.0.0.1:0")
main_process_url = request_socket.getsockopt_string(zmq.LAST_ENDPOINT)
ctx = multiprocessing.get_context("fork")
process = ctx.Process(
target=launch_dist_group,
args=(
main_process_url,
model_parallel_size,
init_model_cb,
),
kwargs=kwargs,
)
process.start()
# wait until the model is loaded; rank 0 will send a message to indicate it's ready
request_socket.send(encode_msg(ReadyRequest()))
response = request_socket.recv()
print("Loaded model...")
return request_socket, process
class ModelParallelProcessGroup:
def __init__(
self,
model_parallel_size: int,
init_model_cb: Callable,
**kwargs,
):
self.model_parallel_size = model_parallel_size
self.init_model_cb = init_model_cb
self.started = False
self.running = False
def start(self):
assert not self.started, "process group already started"
self.request_socket, self.process = start_model_parallel_process(
self.model_parallel_size,
self.init_model_cb,
)
self.started = True
def stop(self):
assert self.started, "process group not started"
if self.process.is_alive():
self.request_socket.send(encode_msg(EndSentinel()), zmq.NOBLOCK)
self.process.join()
self.started = False
def run_inference(
self, req: Union[CompletionRequest, ChatCompletionRequest]
) -> Generator:
assert not self.running, "inference already running"
self.running = True
self.request_socket.send(encode_msg(TaskRequest(task=req)))
try:
while True:
obj_json = self.request_socket.recv()
obj = parse_message(obj_json)
if isinstance(obj, EndSentinel):
break
if isinstance(obj, ExceptionResponse):
print(f"[debug] got exception {obj.error}")
raise Exception(obj.error)
if isinstance(obj, TaskResponse):
yield obj.result
except GeneratorExit as e:
self.request_socket.send(encode_msg(CancelSentinel()))
while True:
                obj_json = self.request_socket.recv()
obj = parse_message(obj_json)
if isinstance(obj, EndSentinel):
break
finally:
self.running = False
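# Illustrative usage sketch (not exercised anywhere in this module; `load_model_cb`
# and `request` are placeholders, not names defined in this file):
#
#   group = ModelParallelProcessGroup(model_parallel_size=2, init_model_cb=load_model_cb)
#   group.start()
#   try:
#       # run_inference yields TokenResult objects streamed back from rank 0
#       for token in group.run_inference(request):
#           print(token.text, end="", flush=True)
#   finally:
#       group.stop()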

View file

@ -0,0 +1,5 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

View file

@ -0,0 +1,184 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
# Copyright (c) Meta Platforms, Inc. and affiliates.
# This software may be used and distributed in accordance with the terms of the Llama 3 Community License Agreement.
import collections
from typing import Optional, Type
try:
import fbgemm_gpu.experimental.gen_ai # noqa: F401
print("Using efficient FP8 operators in FBGEMM.")
except ImportError:
print("No efficient FP8 operators. Please install FBGEMM in fp8_requirements.txt.")
raise
import torch
from torch import nn, Tensor
class Fp8ScaledWeights:
# TODO: Ugly trick so torch allows us to replace parameters
# with our custom Fp8Weights instance. Do this properly.
@property
def __class__(self) -> Type[nn.parameter.Parameter]:
return nn.Parameter
@property
def grad_fn(self) -> None:
return None
# pyre-fixme[4]: Attribute annotation cannot be `Any`.
# pyre-fixme[2]: Parameter annotation cannot be `Any`.
class Fp8RowwiseWeights(
Fp8ScaledWeights,
collections.namedtuple(
"Fp8RowwiseWeights",
["weight", "scale", "shape", "activation_scale_ub"],
),
):
pass
def ffn_swiglu(
x: Tensor,
w1: Fp8RowwiseWeights,
w3: Fp8RowwiseWeights,
w2: Fp8RowwiseWeights,
num_tokens: Optional[Tensor] = None,
is_memory_bounded: bool = False,
) -> Tensor:
if (
isinstance(w1, Fp8ScaledWeights)
and isinstance(w3, Fp8ScaledWeights)
and isinstance(w2, Fp8ScaledWeights)
):
return ffn_swiglu_fp8_dynamic(
x, w1, w3, w2, w1.activation_scale_ub, num_tokens, is_memory_bounded
)
(B, T, D) = x.shape # noqa: N806
(HD_L, D_) = w1.shape # noqa: N806
assert D_ == D
assert isinstance(w1, Tensor)
assert isinstance(w3, Tensor)
x1 = x.view(B * T, D) @ w1.T
x2 = x.view(B * T, D) @ w3.T
z = torch.nn.functional.silu(x1) * x2
del x1, x2
assert isinstance(w2, Tensor)
return (z @ w2.T).view(B, T, D)
@torch.inference_mode()
def quantize_fp8(
w: Tensor,
fp8_activation_scale_ub: float,
output_device: Optional[torch.device] = None,
) -> Fp8RowwiseWeights:
"""Quantize [n, k] weight tensor.
Args:
w (Tensor): [n, k] input high precision tensor to quantize.
fp8_activation_scale_ub (float): Upper bound for activation max.
"""
activation_scale_ub = torch.tensor(
[fp8_activation_scale_ub],
dtype=torch.float,
device="cuda",
)
wq, w_scale = torch.ops.fbgemm.quantize_fp8_per_row(w)
del w
return Fp8RowwiseWeights(
weight=wq,
scale=w_scale,
shape=wq.shape,
activation_scale_ub=activation_scale_ub,
)
@torch.inference_mode()
def load_fp8(
w: Tensor,
w_scale: Tensor,
fp8_activation_scale_ub: float,
) -> Fp8RowwiseWeights:
"""Load FP8 [n, k] weight tensor.
Args:
w (Tensor): [n, k] input FP8.
fp8_activation_scale_ub (float): Upper bound for activation max.
"""
activation_scale_ub = torch.tensor(
[fp8_activation_scale_ub],
dtype=torch.float,
device="cuda",
)
return Fp8RowwiseWeights(
weight=w.to(torch.float8_e4m3fn).to(device="cuda"),
scale=w_scale.to(device="cuda"),
shape=w.shape,
activation_scale_ub=activation_scale_ub,
)
def fc_fp8_dynamic(
x: Tensor,
w: Fp8RowwiseWeights,
activation_scale_ub: Optional[Tensor] = None,
num_tokens: Optional[Tensor] = None,
is_memory_bounded: bool = False,
) -> Tensor:
"""
Single w8a8 fc layer with dynamic row-wise scaling.
"""
if isinstance(w, Fp8RowwiseWeights):
xq, x_scale = torch.ops.fbgemm.quantize_fp8_per_row(
x, num_tokens, activation_scale_ub
)
y = torch.ops.fbgemm.f8f8bf16_rowwise(
xq, w.weight, x_scale, w.scale, use_fast_accum=True
)
del xq
return y
def ffn_swiglu_fp8_dynamic(
x: Tensor,
w1: Fp8RowwiseWeights,
w3: Fp8RowwiseWeights,
w2: Fp8RowwiseWeights,
activation_scale_ub: Optional[Tensor] = None,
num_tokens: Optional[Tensor] = None,
is_memory_bounded: bool = False,
) -> Tensor:
(B, T, D) = x.shape # noqa: N806
HD_L = w1.shape[0] # noqa: N806
assert HD_L == w3.shape[0]
x1 = fc_fp8_dynamic(
x.view(B * T, D),
w1,
activation_scale_ub,
num_tokens,
is_memory_bounded,
)
x2 = fc_fp8_dynamic(
x.view(B * T, D),
w3,
activation_scale_ub,
num_tokens,
is_memory_bounded,
)
z = torch.nn.functional.silu(x1) * x2
del x1, x2
z_ = fc_fp8_dynamic(z, w2, activation_scale_ub, num_tokens, is_memory_bounded)
return z_.view(B, T, D)
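# Minimal end-to-end sketch (assumes a CUDA device with FBGEMM FP8 kernels; the
# shapes below are illustrative only):
#
#   x = torch.randn(1, 16, 4096, dtype=torch.bfloat16, device="cuda")
#   w1 = torch.randn(14336, 4096, dtype=torch.bfloat16, device="cuda")
#   w3 = torch.randn(14336, 4096, dtype=torch.bfloat16, device="cuda")
#   w2 = torch.randn(4096, 14336, dtype=torch.bfloat16, device="cuda")
#   ub = 1200.0
#   out = ffn_swiglu(x, quantize_fp8(w1, ub), quantize_fp8(w3, ub), quantize_fp8(w2, ub))
#   # all three weights are Fp8ScaledWeights, so this dispatches to ffn_swiglu_fp8_dynamic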

View file

@ -0,0 +1,76 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
# Copyright (c) Meta Platforms, Inc. and affiliates.
# This software may be used and distributed in accordance with the terms of the Llama 3 Community License Agreement.
import unittest
import torch
from fp8_impls import ffn_swiglu_fp8_dynamic, quantize_fp8
from hypothesis import given, settings, strategies as st
from torch import Tensor
@unittest.skipIf(
not torch.cuda.is_available()
or torch.cuda.get_device_properties(torch.cuda.current_device()).major < 9,
"Skip when H100 is not available",
)
class FP8Tests(unittest.TestCase):
@settings(deadline=None)
@given(
D=st.sampled_from([4096, 8192]),
HD_L=st.sampled_from([1280, 2560]),
B=st.sampled_from([1, 2]),
T=st.sampled_from([2048, 4096]),
UB=st.sampled_from([1000, 10000]),
)
def test_fp8_ffn(
self,
D: int, # noqa
HD_L: int,
B: int,
T: int,
UB: float,
) -> None:
x = torch.randn(size=(B, T, D), dtype=torch.bfloat16, device="cuda") * 0.1
w1 = torch.randn(size=(HD_L, D), dtype=torch.bfloat16, device="cuda") * 0.01
w3 = torch.randn(size=(HD_L, D), dtype=torch.bfloat16, device="cuda") * 0.01
w2 = torch.randn(size=(D, HD_L), dtype=torch.bfloat16, device="cuda") * 0.1
        x_q = quantize_fp8(x, UB)
        w1_q = quantize_fp8(w1, UB)
        w3_q = quantize_fp8(w3, UB)
        w2_q = quantize_fp8(w2, UB)
def ref_ffn(x: Tensor, w1: Tensor, w3: Tensor, w2: Tensor) -> Tensor:
(B, T, D) = x.shape # noqa: N806
(HD_L, D_) = w1.shape # noqa: N806
assert D_ == D
x1 = x.view(B * T, D) @ w1.T
x2 = x.view(B * T, D) @ w3.T
z = torch.nn.functional.silu(x1) * x2
return (z @ w2.T).view(B, T, D).to(torch.bfloat16)
v = ffn_swiglu_fp8_dynamic(x, w1_q, w3_q, w2_q)
# Fake quant
x = x_q.weight.bfloat16() * x_q.scale.unsqueeze(-1)
w1 = w1_q.weight.bfloat16() * w1_q.scale.unsqueeze(-1)
w3 = w3_q.weight.bfloat16() * w3_q.scale.unsqueeze(-1)
w2 = w2_q.weight.bfloat16() * w2_q.scale.unsqueeze(-1)
v_ref = ref_ffn(x, w1, w3, w2)
torch.testing.assert_close(v_ref, v, atol=4.0e-3, rtol=4.0e-3)
if __name__ == "__main__":
unittest.main()

View file

@ -0,0 +1,92 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import math
import re
import torch
from torch import nn
def hadamard_transform(x: torch.Tensor) -> torch.Tensor:
"""Hadamard transform.
This function performs the Hadamard transform on the input tensor 'x'.
The Hadamard transform is a linear transformation that multiplies the input
tensor by the Hadamard matrix of dimension n x n, where n is the size of
the last dimension of the input tensor.
"""
*_, n = x.shape
m = int(math.log2(n))
assert n == 1 << m, "n must be a power of 2"
x = x[..., None]
inv_sqrt2 = 0.5**0.5
for _ in range(m):
top = x[..., ::2, :] + x[..., 1::2, :]
bot = x[..., ::2, :] - x[..., 1::2, :]
x = torch.cat((top, bot), dim=-1)
x *= inv_sqrt2
res = x.squeeze(-2)
return res
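# Worked example: for n = 2 the single round above applies the 2x2 Hadamard matrix
# scaled by 1/sqrt(2), so hadamard_transform(torch.tensor([1.0, 1.0])) returns
# approximately tensor([1.4142, 0.0000]).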
class HadamardModule(torch.nn.Module):
"""A module that applies the Hadamard transform to the input tensor.
Args:
group_size: The size of the groups that the input tensor will be divided into
before applying the Hadamard transform.
"""
def __init__(self, group_size: int) -> None:
super().__init__()
self.group_size = group_size
def forward(self, x: torch.Tensor) -> torch.Tensor:
reshape_back = False
orig_shape = x.shape
if self.group_size != x.shape[-1]:
reshape_back = True
x = x.reshape(-1, x.shape[-1] // self.group_size, self.group_size)
x = hadamard_transform(x)
if reshape_back:
x = x.reshape(orig_shape)
return x
def add_hadamard_transform_for_spinquant(
model: torch.nn.Module, prefix: str = ""
) -> None:
"""
Adds a Hadamard transform to the last linear layer of each feedforward network (FFN) in the model.
This function recursively traverses the model's children and looks for layers that match the pattern
"layers.<digit>.feed_forward.w2", where <digit> is one or more digits. When such a layer is found,
it is replaced with a new sequential module that consists of a HadamardModule followed by the original
layer. The HadamardModule applies the Hadamard transform to the input tensor.
    See the `SpinQuant <https://arxiv.org/abs/2405.16406>`_ paper for more details.
Args:
model: An instance of 'torch.nn.Module' (e.g., Transformer model).
prefix: A string prefix to add to the full name of each child module.
Returns:
None
"""
pattern_last_linear_ffn = r"layers.\d+.feed_forward.w2"
for module_name, module in model.named_children():
child_full_name = prefix + "." + module_name
if re.search(pattern_last_linear_ffn, child_full_name):
new_module = nn.Sequential(
HadamardModule(group_size=module.in_features), module
)
del module
setattr(model, module_name, new_module)
else:
add_hadamard_transform_for_spinquant(
module, (prefix + "." if prefix else prefix) + module_name
)

View file

@ -0,0 +1,339 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
# Copyright (c) Meta Platforms, Inc. and affiliates.
# This software may be used and distributed in accordance with the terms of the Llama 3 Community License Agreement.
import os
from typing import Any, Dict, List, Optional
import torch
from fairscale.nn.model_parallel.initialize import get_model_parallel_rank
from fairscale.nn.model_parallel.layers import ColumnParallelLinear, RowParallelLinear
from fairscale.nn.model_parallel.mappings import reduce_from_model_parallel_region
from llama_models.datatypes import CheckpointQuantizationFormat
from llama_models.llama3.api.args import ModelArgs
from llama_models.llama3.reference_impl.model import Transformer, TransformerBlock
from llama_models.sku_list import resolve_model
from termcolor import cprint
from torch import nn, Tensor
from torchao.quantization.GPTQ import Int8DynActInt4WeightLinear
from llama_stack.apis.inference import QuantizationType
from llama_stack.providers.inline.meta_reference.inference.config import (
MetaReferenceQuantizedInferenceConfig,
)
def swiglu_wrapper(
self,
x: Tensor,
):
from .fp8_impls import ffn_swiglu
out = ffn_swiglu(x, self.w1.weight, self.w3.weight, self.w2.weight)
return reduce_from_model_parallel_region(out)
def convert_to_fp8_quantized_model(
model: Transformer,
config: MetaReferenceQuantizedInferenceConfig,
checkpoint_dir: str,
fp8_activation_scale_ub: Optional[float] = 1200.0,
) -> Transformer:
if config.quantization.type == QuantizationType.bf16.value:
return model
elif config.quantization.type != QuantizationType.fp8.value:
raise ValueError("Only FP8 quantization is supported")
from .fp8_impls import Fp8ScaledWeights, load_fp8, quantize_fp8
llama_model = resolve_model(config.model)
assert llama_model is not None, f"Model {config.model} not found"
# Move weights to GPU with quantization
if llama_model.quantization_format == CheckpointQuantizationFormat.fp8_mixed.value:
cprint("Loading fp8 scales...", "yellow")
fp8_scales_path = os.path.join(
checkpoint_dir, f"fp8_scales_{get_model_parallel_rank()}.pt"
)
assert os.path.isfile(
fp8_scales_path
), f"fp8_scales_path not found for rank {get_model_parallel_rank()}"
fp8_scales = torch.load(fp8_scales_path, weights_only=True)
for block in model.layers:
if isinstance(block, TransformerBlock):
if block.layer_id == 0 or block.layer_id == (model.n_layers - 1):
continue
block.feed_forward.forward = swiglu_wrapper.__get__(block.feed_forward)
for key in ("w1", "w3", "w2"):
param = getattr(block.feed_forward, key)
param.weight = load_fp8(
param.weight,
fp8_scales[
f"{block.layer_id}_feed_forward.{key}_{get_model_parallel_rank()}"
],
fp8_activation_scale_ub,
)
else:
cprint("Quantizing fp8 weights from bf16...", "yellow")
for block in model.layers:
if isinstance(block, TransformerBlock):
if block.layer_id == 0 or block.layer_id == (model.n_layers - 1):
continue
block.feed_forward.forward = swiglu_wrapper.__get__(block.feed_forward)
for key in ("w1", "w3", "w2"):
param = getattr(block.feed_forward, key)
param.weight = quantize_fp8(
param.weight,
fp8_activation_scale_ub,
output_device=torch.device("cuda"),
)
for _, parameter in model.named_parameters():
if not isinstance(parameter, Fp8ScaledWeights):
parameter.data = parameter.to(device="cuda")
return model
class Int8DynActInt4WeightLinearLoRA(Int8DynActInt4WeightLinear):
"""
Int8DynActInt4WeightLinear with LoRA adaptor.
Args:
in_features: Number of input features.
out_features: Number of output features.
bias: Whether to use bias.
device: Device to use.
group_size: Group size for quantization.
precision: Precision of quantization.
scales_precision: Precision of scales.
lora_rank: Rank of LoRA adaptor.
lora_scale: Scale of LoRA adaptor.
"""
def __init__(
self,
in_features: int,
out_features: int,
bias=False,
device=None,
# quantization parameters
group_size: int = 256,
precision: torch.dtype = torch.float32,
scales_precision: torch.dtype = torch.float32,
# LoRA parameters
lora_rank: Optional[int] = None,
lora_scale: Optional[float] = None,
) -> None:
super().__init__(
in_features,
out_features,
bias=bias,
device=device,
groupsize=group_size,
precision=precision,
scales_precision=scales_precision,
)
if lora_rank is not None:
assert lora_scale is not None, "Please specify lora scale for LoRA."
# Low-rank adaptation. See paper for more details: https://arxiv.org/abs/2106.09685
self.adaptor = nn.Sequential()
self.adaptor.add_module("A", nn.Linear(in_features, lora_rank, bias=False))
self.adaptor.add_module("B", nn.Linear(lora_rank, out_features, bias=False))
self.lora_scale = lora_scale
else:
self.adaptor = None
self.lora_scale = None
self._register_load_state_dict_pre_hook(self.load_hook)
def load_hook(
self,
state_dict: Dict[str, Any],
prefix: str,
local_metadata: Dict[str, Any],
strict: bool,
missing_keys: List[str],
unexpected_keys: List[str],
error_msgs: List[str],
) -> None:
"""A hook to load the quantized weights from the state dict."""
if prefix + "zeros" not in state_dict:
# Zero-point may not be saved in the state dict. In this case, we assume it's zero.
assert prefix + "scales" in state_dict
state_dict[prefix + "zeros"] = torch.zeros_like(
state_dict[prefix + "scales"]
)
def forward(self, input_: torch.Tensor) -> torch.Tensor:
module_out = super().forward(input_)
if self.adaptor is not None:
adaptor_out = self.adaptor(input_) * self.lora_scale
return module_out + adaptor_out
return module_out
class Int8WeightEmbedding(torch.nn.Embedding):
"""An embedding layer to load int8 weights.
Args:
num_embeddings: Number of embeddings.
embedding_dim: Embedding dimension.
padding_idx: Padding index.
"""
def __init__(
self,
num_embeddings: int,
embedding_dim: int,
padding_idx: int,
device=None,
) -> None:
super().__init__(num_embeddings, embedding_dim, padding_idx, device=device)
self._register_load_state_dict_pre_hook(self.load_hook)
def load_hook(
self,
state_dict: Dict[str, Any],
prefix: str,
local_metadata: Dict[str, Any],
strict: bool,
missing_keys: List[str],
unexpected_keys: List[str],
error_msgs: List[str],
) -> None:
"""A hook to load the quantized embedding weight and scales from the state dict."""
weights = state_dict.pop(prefix + "weight")
scales = state_dict.pop(prefix + "scales")
state_dict[prefix + "weight"] = weights * scales
class Int8WeightLinear(torch.nn.Linear):
"""A linear layer to load int8 weights.
Args:
in_features: Number of input features.
out_features: Number of output features.
bias: Whether to use bias.
"""
def __init__(
self, in_features: int, out_features: int, bias: bool = True, device=None
) -> None:
super().__init__(in_features, out_features, bias, device=device)
self._register_load_state_dict_pre_hook(self.load_hook)
def load_hook(
self,
state_dict: Dict[str, Any],
prefix: str,
local_metadata: Dict[str, Any],
strict: bool,
missing_keys: List[str],
unexpected_keys: List[str],
error_msgs: List[str],
) -> None:
"""A hook to load the quantized linear weight and scales from the state dict."""
weights = state_dict.pop(prefix + "weight")
scales = state_dict.pop(prefix + "scales")
state_dict[prefix + "weight"] = weights * scales
def _prepare_model_int4_weight_int8_dynamic_activation(
model: torch.nn.Module,
group_size: int,
lora_rank: Optional[int],
lora_scale: Optional[float],
):
"""Prepare the model for int4 weight and int8 dynamic activation quantization.
Note that the weights of embedding and output layers are quantized to int8.
"""
device = None
for module_name, module in model.named_children():
if module_name == "output":
quantized_module = Int8WeightLinear(
in_features=module.in_features,
out_features=module.out_features,
                bias=module.bias is not None,
device=device,
)
del module
setattr(model, module_name, quantized_module)
elif module_name == "tok_embeddings":
quantized_module = Int8WeightEmbedding(
num_embeddings=module.num_embeddings,
embedding_dim=module.embedding_dim,
padding_idx=module.padding_idx,
device=device,
)
del module
setattr(model, module_name, quantized_module)
elif isinstance(module, (ColumnParallelLinear, RowParallelLinear, nn.Linear)):
quantized_module = Int8DynActInt4WeightLinearLoRA(
in_features=module.in_features,
out_features=module.out_features,
bias=False,
group_size=group_size,
lora_rank=lora_rank,
lora_scale=lora_scale,
device=device,
)
del module
setattr(model, module_name, quantized_module)
else:
_prepare_model_int4_weight_int8_dynamic_activation(
module, group_size, lora_rank, lora_scale
)
return model
def convert_to_int4_quantized_model(
model: Transformer,
model_args: ModelArgs,
config: MetaReferenceQuantizedInferenceConfig,
) -> Transformer:
"""Convert the model to int4 quantized model."""
if model_args.quantization_args is None:
raise ValueError("'quantization_args' cannot be None. Please specify it.")
quantization_args = model_args.quantization_args
if quantization_args.scheme.value != "int4_weight_int8_dynamic_activation":
raise NotImplementedError(
"Only int4 quantization with 'int4_weight_int8_dynamic_activation' scheme is supported."
)
group_size = model_args.quantization_args.group_size
if group_size is None:
raise ValueError(
"'group_size' cannot be None in 'quantization_args'. Please specify it."
)
if model_args.lora_args is None:
# Certain quantized models (e.g., SpinQuant) may not have LoRA.
lora_rank = None
lora_scale = None
else:
lora_rank = model_args.lora_args.rank
lora_scale = model_args.lora_args.scale
_prepare_model_int4_weight_int8_dynamic_activation(
model, group_size, lora_rank, lora_scale
)
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
return model.to(device)
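# Illustrative call sites (sketch only; `model_args`, `config`, `checkpoint` and
# `checkpoint_dir` are assumed to come from the meta-reference generator setup):
#
#   model = Transformer(model_args)
#   model.load_state_dict(checkpoint, strict=False)
#   # FP8: rewrites FFN weights in place and patches feed_forward.forward
#   model = convert_to_fp8_quantized_model(model, config, checkpoint_dir)
#   # or, int4 weights + int8 dynamic activations: swaps linear/embedding modules
#   # model = convert_to_int4_quantized_model(model, model_args, config)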

View file

@ -0,0 +1,5 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

View file

@ -0,0 +1,36 @@
#!/bin/bash
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
if [[ $# -ne 1 ]]; then
echo "Error: Please provide the name of CONDA environment you wish to create"
exit 1
fi
ENV_NAME=$1
set -eu
eval "$(conda shell.bash hook)"
echo "Will build env (or overwrite) named '$ENV_NAME'"
set -x
run_build() {
# Set up the conda environment
yes | conda remove --name $ENV_NAME --all
yes | conda create -n $ENV_NAME python=3.10
conda activate $ENV_NAME
# PT nightly
pip install --pre torch --index-url https://download.pytorch.org/whl/nightly/cu121
# install dependencies for `llama-agentic-system`
pip install -r fp8_requirements.txt
}
run_build

View file

@ -0,0 +1,161 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
# Copyright (c) Meta Platforms, Inc. and affiliates.
# This software may be used and distributed in accordance with the terms of the Llama 3 Community License Agreement.
import json
import os
import shutil
import sys
from pathlib import Path
from typing import Optional
import fire
import torch
from fairscale.nn.model_parallel.initialize import (
get_model_parallel_rank,
initialize_model_parallel,
model_parallel_is_initialized,
)
from fp8.fp8_impls import FfnQuantizeMode, quantize_fp8
from llama.model import ModelArgs, Transformer, TransformerBlock
from llama.tokenizer import Tokenizer
from torch.nn.parameter import Parameter
def main(
ckpt_dir: str,
tokenizer_path: str,
quantized_ckpt_dir: str,
max_seq_len: Optional[int] = 512,
max_batch_size: Optional[int] = 4,
model_parallel_size: Optional[int] = None,
ffn_quantize_mode: Optional[FfnQuantizeMode] = FfnQuantizeMode.FP8_ROWWISE,
fp8_activation_scale_ub: Optional[float] = 1200.0,
seed: int = 1,
):
""" """
if not os.path.exists(quantized_ckpt_dir):
os.makedirs(quantized_ckpt_dir)
shutil.copy(
os.path.join(ckpt_dir, "params.json"),
os.path.join(quantized_ckpt_dir, "params.json"),
)
shutil.copy(
os.path.join(ckpt_dir, "tokenizer.model"),
os.path.join(quantized_ckpt_dir, "tokenizer.model"),
)
if not torch.distributed.is_initialized():
torch.distributed.init_process_group("nccl")
if not model_parallel_is_initialized():
if model_parallel_size is None:
model_parallel_size = int(os.environ.get("WORLD_SIZE", 1))
initialize_model_parallel(model_parallel_size)
local_rank = int(os.environ.get("LOCAL_RANK", 0))
torch.cuda.set_device(local_rank)
# seed must be the same in all processes
torch.manual_seed(seed)
if local_rank > 0:
sys.stdout = open(os.devnull, "w")
checkpoints = sorted(Path(ckpt_dir).glob("*.pth"))
assert len(checkpoints) > 0, f"no checkpoint files found in {ckpt_dir}"
assert model_parallel_size == len(
checkpoints
), f"Loading a checkpoint for MP={len(checkpoints)} but world size is {model_parallel_size}"
ckpt_path = checkpoints[get_model_parallel_rank()]
checkpoint = torch.load(ckpt_path, map_location="cpu", weights_only=True)
with open(Path(ckpt_dir) / "params.json", "r") as f:
params = json.loads(f.read())
model_args: ModelArgs = ModelArgs(
max_seq_len=max_seq_len,
max_batch_size=max_batch_size,
**params,
)
tokenizer = Tokenizer(model_path=tokenizer_path)
assert (
model_args.vocab_size == tokenizer.n_words
), f"model_args vocab = {model_args.vocab_size} but tokenizer vocab = {tokenizer.n_words}"
# load on CPU in bf16 so that fp8 conversion does not find an unexpected (fp32, e.g.) datatype
torch.set_default_tensor_type(torch.BFloat16Tensor)
model = Transformer(model_args)
model.load_state_dict(checkpoint, strict=False)
if torch.cuda.is_bf16_supported():
torch.set_default_tensor_type(torch.cuda.BFloat16Tensor)
else:
torch.set_default_tensor_type(torch.cuda.HalfTensor)
print(ckpt_path)
assert (
quantized_ckpt_dir is not None
), "QUantized checkpoint directory should not be None"
fp8_scales = {}
for block in model.layers:
if isinstance(block, TransformerBlock):
if block.layer_id == 0 or block.layer_id == (model.n_layers - 1):
continue
fp8_weight = quantize_fp8(
block.feed_forward.w1.weight,
fp8_activation_scale_ub,
ffn_quantize_mode,
output_device=torch.device("cpu"),
)
with torch.inference_mode():
block.feed_forward.w1.weight = Parameter(fp8_weight.weight)
fp8_scales[
f"{block.layer_id}_feed_forward.w1_{get_model_parallel_rank()}"
] = fp8_weight.scale
fp8_weight = quantize_fp8(
block.feed_forward.w3.weight,
fp8_activation_scale_ub,
ffn_quantize_mode,
output_device=torch.device("cpu"),
)
with torch.inference_mode():
block.feed_forward.w3.weight = Parameter(fp8_weight.weight)
fp8_scales[
f"{block.layer_id}_feed_forward.w3_{get_model_parallel_rank()}"
] = fp8_weight.scale
fp8_weight = quantize_fp8(
block.feed_forward.w2.weight,
fp8_activation_scale_ub,
ffn_quantize_mode,
output_device=torch.device("cpu"),
)
with torch.inference_mode():
block.feed_forward.w2.weight = Parameter(fp8_weight.weight)
fp8_scales[
f"{block.layer_id}_feed_forward.w2_{get_model_parallel_rank()}"
] = fp8_weight.scale
fp8_scales_path = os.path.join(
quantized_ckpt_dir, f"fp8_scales_{get_model_parallel_rank()}.pt"
)
torch.save(fp8_scales, fp8_scales_path)
ckpt_path = os.path.join(
quantized_ckpt_dir,
"consolidated.{:02d}.pth".format(get_model_parallel_rank()),
)
torch.save(model.state_dict(), ckpt_path)
if __name__ == "__main__":
fire.Fire(main)

View file

@ -0,0 +1,31 @@
#!/bin/bash
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
set -euo pipefail
set -x
cd $(git rev-parse --show-toplevel)
MASTER_HOST=$1
RUN_ID=$2
CKPT_DIR=$3
QUANT_CKPT_DIR=$4
TOKENIZER_PATH=$5
NNODES=$6
NPROC=$7
echo $MASTER_HOST, $RUN_ID, $CKPT_DIR, $QUANT_CKPT_DIR
NCCL_NET=Socket NCCL_SOCKET_IFNAME=eth TIKTOKEN_CACHE_DIR="" \
torchrun \
--nnodes=$NNODES --nproc_per_node=$NPROC \
--rdzv_id=$RUN_ID \
--rdzv_conf='timeout=120' \
--rdzv_backend=c10d \
--rdzv_endpoint="${MASTER_HOST}:29502" \
quantize_checkpoint.py $CKPT_DIR $TOKENIZER_PATH $QUANT_CKPT_DIR

View file

@ -0,0 +1,19 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from .config import FaissImplConfig
async def get_provider_impl(config: FaissImplConfig, _deps):
from .faiss import FaissMemoryImpl
assert isinstance(
config, FaissImplConfig
), f"Unexpected config type: {type(config)}"
impl = FaissMemoryImpl(config)
await impl.initialize()
return impl

View file

@ -0,0 +1,21 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from llama_models.schema_utils import json_schema_type
from pydantic import BaseModel
from llama_stack.distribution.utils.config_dirs import RUNTIME_BASE_DIR
from llama_stack.providers.utils.kvstore.config import (
KVStoreConfig,
SqliteKVStoreConfig,
)
@json_schema_type
class FaissImplConfig(BaseModel):
kvstore: KVStoreConfig = SqliteKVStoreConfig(
db_path=(RUNTIME_BASE_DIR / "faiss_store.db").as_posix()
) # Uses SQLite config specific to FAISS storage

View file

@ -0,0 +1,141 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import logging
from typing import Any, Dict, List, Optional
import faiss
import numpy as np
from numpy.typing import NDArray
from llama_models.llama3.api.datatypes import * # noqa: F403
from llama_stack.apis.memory import * # noqa: F403
from llama_stack.providers.datatypes import MemoryBanksProtocolPrivate
from llama_stack.providers.utils.kvstore import kvstore_impl
from llama_stack.providers.utils.memory.vector_store import (
ALL_MINILM_L6_V2_DIMENSION,
BankWithIndex,
EmbeddingIndex,
)
from llama_stack.providers.utils.telemetry import tracing
from .config import FaissImplConfig
logger = logging.getLogger(__name__)
MEMORY_BANKS_PREFIX = "memory_banks:"
class FaissIndex(EmbeddingIndex):
id_by_index: Dict[int, str]
    chunk_by_index: Dict[int, Chunk]
def __init__(self, dimension: int):
self.index = faiss.IndexFlatL2(dimension)
self.id_by_index = {}
self.chunk_by_index = {}
@tracing.span(name="add_chunks")
async def add_chunks(self, chunks: List[Chunk], embeddings: NDArray):
indexlen = len(self.id_by_index)
for i, chunk in enumerate(chunks):
self.chunk_by_index[indexlen + i] = chunk
self.id_by_index[indexlen + i] = chunk.document_id
self.index.add(np.array(embeddings).astype(np.float32))
async def query(
self, embedding: NDArray, k: int, score_threshold: float
) -> QueryDocumentsResponse:
distances, indices = self.index.search(
embedding.reshape(1, -1).astype(np.float32), k
)
chunks = []
scores = []
for d, i in zip(distances[0], indices[0]):
if i < 0:
continue
chunks.append(self.chunk_by_index[int(i)])
scores.append(1.0 / float(d))
return QueryDocumentsResponse(chunks=chunks, scores=scores)
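# Sketch of using the index in isolation (illustrative; `chunks`, `embeddings` and
# `query_embedding` are placeholders shaped to match ALL_MINILM_L6_V2_DIMENSION):
#
#   index = FaissIndex(dimension=ALL_MINILM_L6_V2_DIMENSION)
#   await index.add_chunks(chunks, embeddings)   # embeddings: (len(chunks), dim) float array
#   resp = await index.query(query_embedding, k=3, score_threshold=0.0)
#   # scores are 1 / L2-distance, so smaller distances rank higher in resp.scores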
class FaissMemoryImpl(Memory, MemoryBanksProtocolPrivate):
def __init__(self, config: FaissImplConfig) -> None:
self.config = config
self.cache = {}
self.kvstore = None
async def initialize(self) -> None:
self.kvstore = await kvstore_impl(self.config.kvstore)
# Load existing banks from kvstore
start_key = MEMORY_BANKS_PREFIX
end_key = f"{MEMORY_BANKS_PREFIX}\xff"
stored_banks = await self.kvstore.range(start_key, end_key)
for bank_data in stored_banks:
bank = VectorMemoryBankDef.model_validate_json(bank_data)
index = BankWithIndex(
bank=bank, index=FaissIndex(ALL_MINILM_L6_V2_DIMENSION)
)
self.cache[bank.identifier] = index
async def shutdown(self) -> None:
# Cleanup if needed
pass
async def register_memory_bank(
self,
memory_bank: MemoryBankDef,
) -> None:
assert (
memory_bank.type == MemoryBankType.vector.value
), f"Only vector banks are supported {memory_bank.type}"
# Store in kvstore
key = f"{MEMORY_BANKS_PREFIX}{memory_bank.identifier}"
await self.kvstore.set(
key=key,
value=memory_bank.json(),
)
# Store in cache
index = BankWithIndex(
bank=memory_bank, index=FaissIndex(ALL_MINILM_L6_V2_DIMENSION)
)
self.cache[memory_bank.identifier] = index
async def list_memory_banks(self) -> List[MemoryBankDef]:
return [i.bank for i in self.cache.values()]
async def insert_documents(
self,
bank_id: str,
documents: List[MemoryBankDocument],
ttl_seconds: Optional[int] = None,
) -> None:
index = self.cache.get(bank_id)
if index is None:
raise ValueError(f"Bank {bank_id} not found")
await index.insert_documents(documents)
async def query_documents(
self,
bank_id: str,
query: InterleavedTextMedia,
params: Optional[Dict[str, Any]] = None,
) -> QueryDocumentsResponse:
index = self.cache.get(bank_id)
if index is None:
raise ValueError(f"Bank {bank_id} not found")
return await index.query_documents(query, params)

View file

@ -0,0 +1,73 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import tempfile
import pytest
from llama_stack.apis.memory import MemoryBankType, VectorMemoryBankDef
from llama_stack.providers.inline.meta_reference.memory.config import FaissImplConfig
from llama_stack.providers.inline.meta_reference.memory.faiss import FaissMemoryImpl
from llama_stack.providers.utils.kvstore.config import SqliteKVStoreConfig
class TestFaissMemoryImpl:
@pytest.fixture
def faiss_impl(self):
# Create a temporary SQLite database file
temp_db = tempfile.NamedTemporaryFile(suffix=".db", delete=False)
config = FaissImplConfig(kvstore=SqliteKVStoreConfig(db_path=temp_db.name))
return FaissMemoryImpl(config)
@pytest.mark.asyncio
async def test_initialize(self, faiss_impl):
# Test empty initialization
await faiss_impl.initialize()
assert len(faiss_impl.cache) == 0
# Test initialization with existing banks
bank = VectorMemoryBankDef(
identifier="test_bank",
type=MemoryBankType.vector.value,
embedding_model="all-MiniLM-L6-v2",
chunk_size_in_tokens=512,
overlap_size_in_tokens=64,
)
# Register a bank and reinitialize to test loading
await faiss_impl.register_memory_bank(bank)
# Create new instance to test initialization with existing data
new_impl = FaissMemoryImpl(faiss_impl.config)
await new_impl.initialize()
assert len(new_impl.cache) == 1
assert "test_bank" in new_impl.cache
@pytest.mark.asyncio
async def test_register_memory_bank(self, faiss_impl):
bank = VectorMemoryBankDef(
identifier="test_bank",
type=MemoryBankType.vector.value,
embedding_model="all-MiniLM-L6-v2",
chunk_size_in_tokens=512,
overlap_size_in_tokens=64,
)
await faiss_impl.initialize()
await faiss_impl.register_memory_bank(bank)
assert "test_bank" in faiss_impl.cache
assert faiss_impl.cache["test_bank"].bank == bank
# Verify persistence
new_impl = FaissMemoryImpl(faiss_impl.config)
await new_impl.initialize()
assert "test_bank" in new_impl.cache
if __name__ == "__main__":
pytest.main([__file__])

View file

@ -0,0 +1,17 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from .config import LlamaGuardShieldConfig, SafetyConfig # noqa: F401
async def get_provider_impl(config: SafetyConfig, deps):
from .safety import MetaReferenceSafetyImpl
assert isinstance(config, SafetyConfig), f"Unexpected config type: {type(config)}"
impl = MetaReferenceSafetyImpl(config, deps)
await impl.initialize()
return impl

View file

@ -0,0 +1,57 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from abc import ABC, abstractmethod
from typing import List
from llama_models.llama3.api.datatypes import interleaved_text_media_as_str, Message
from pydantic import BaseModel
from llama_stack.apis.safety import * # noqa: F403
CANNED_RESPONSE_TEXT = "I can't answer that. Can I help with something else?"
# TODO: clean this up; just remove this type completely
class ShieldResponse(BaseModel):
is_violation: bool
violation_type: Optional[str] = None
violation_return_message: Optional[str] = None
# TODO: this is a caller / agent concern
class OnViolationAction(Enum):
IGNORE = 0
WARN = 1
RAISE = 2
class ShieldBase(ABC):
def __init__(
self,
on_violation_action: OnViolationAction = OnViolationAction.RAISE,
):
self.on_violation_action = on_violation_action
@abstractmethod
async def run(self, messages: List[Message]) -> ShieldResponse:
raise NotImplementedError()
def message_content_as_str(message: Message) -> str:
return interleaved_text_media_as_str(message.content)
class TextShield(ShieldBase):
def convert_messages_to_text(self, messages: List[Message]) -> str:
return "\n".join([message_content_as_str(m) for m in messages])
async def run(self, messages: List[Message]) -> ShieldResponse:
text = self.convert_messages_to_text(messages)
return await self.run_impl(text)
@abstractmethod
async def run_impl(self, text: str) -> ShieldResponse:
raise NotImplementedError()
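# Minimal concrete TextShield for illustration (the keyword check is a placeholder
# policy, not something shipped by this provider):
#
#   class KeywordShield(TextShield):
#       async def run_impl(self, text: str) -> ShieldResponse:
#           if "forbidden" in text.lower():
#               return ShieldResponse(
#                   is_violation=True,
#                   violation_type="keyword",
#                   violation_return_message=CANNED_RESPONSE_TEXT,
#               )
#           return ShieldResponse(is_violation=False)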

View file

@ -0,0 +1,48 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from enum import Enum
from typing import List, Optional
from llama_models.sku_list import CoreModelId, safety_models
from pydantic import BaseModel, field_validator
class PromptGuardType(Enum):
injection = "injection"
jailbreak = "jailbreak"
class LlamaGuardShieldConfig(BaseModel):
model: str = "Llama-Guard-3-1B"
excluded_categories: List[str] = []
@field_validator("model")
@classmethod
def validate_model(cls, model: str) -> str:
permitted_models = [
m.descriptor()
for m in safety_models()
if (
m.core_model_id
in {
CoreModelId.llama_guard_3_8b,
CoreModelId.llama_guard_3_1b,
CoreModelId.llama_guard_3_11b_vision,
}
)
]
if model not in permitted_models:
raise ValueError(
f"Invalid model: {model}. Must be one of {permitted_models}"
)
return model
class SafetyConfig(BaseModel):
llama_guard_shield: Optional[LlamaGuardShieldConfig] = None
enable_prompt_guard: Optional[bool] = False

View file

@ -0,0 +1,268 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import re
from string import Template
from typing import List, Optional
from llama_models.llama3.api.datatypes import * # noqa: F403
from llama_stack.apis.inference import * # noqa: F403
from .base import CANNED_RESPONSE_TEXT, OnViolationAction, ShieldBase, ShieldResponse
SAFE_RESPONSE = "safe"
_INSTANCE = None
CAT_VIOLENT_CRIMES = "Violent Crimes"
CAT_NON_VIOLENT_CRIMES = "Non-Violent Crimes"
CAT_SEX_CRIMES = "Sex Crimes"
CAT_CHILD_EXPLOITATION = "Child Exploitation"
CAT_DEFAMATION = "Defamation"
CAT_SPECIALIZED_ADVICE = "Specialized Advice"
CAT_PRIVACY = "Privacy"
CAT_INTELLECTUAL_PROPERTY = "Intellectual Property"
CAT_INDISCRIMINATE_WEAPONS = "Indiscriminate Weapons"
CAT_HATE = "Hate"
CAT_SELF_HARM = "Self-Harm"
CAT_SEXUAL_CONTENT = "Sexual Content"
CAT_ELECTIONS = "Elections"
CAT_CODE_INTERPRETER_ABUSE = "Code Interpreter Abuse"
SAFETY_CATEGORIES_TO_CODE_MAP = {
CAT_VIOLENT_CRIMES: "S1",
CAT_NON_VIOLENT_CRIMES: "S2",
CAT_SEX_CRIMES: "S3",
CAT_CHILD_EXPLOITATION: "S4",
CAT_DEFAMATION: "S5",
CAT_SPECIALIZED_ADVICE: "S6",
CAT_PRIVACY: "S7",
CAT_INTELLECTUAL_PROPERTY: "S8",
CAT_INDISCRIMINATE_WEAPONS: "S9",
CAT_HATE: "S10",
CAT_SELF_HARM: "S11",
CAT_SEXUAL_CONTENT: "S12",
CAT_ELECTIONS: "S13",
CAT_CODE_INTERPRETER_ABUSE: "S14",
}
DEFAULT_LG_V3_SAFETY_CATEGORIES = [
CAT_VIOLENT_CRIMES,
CAT_NON_VIOLENT_CRIMES,
CAT_SEX_CRIMES,
CAT_CHILD_EXPLOITATION,
CAT_DEFAMATION,
CAT_SPECIALIZED_ADVICE,
CAT_PRIVACY,
CAT_INTELLECTUAL_PROPERTY,
CAT_INDISCRIMINATE_WEAPONS,
CAT_HATE,
CAT_SELF_HARM,
CAT_SEXUAL_CONTENT,
CAT_ELECTIONS,
]
MODEL_TO_SAFETY_CATEGORIES_MAP = {
CoreModelId.llama_guard_3_8b.value: (
DEFAULT_LG_V3_SAFETY_CATEGORIES + [CAT_CODE_INTERPRETER_ABUSE]
),
CoreModelId.llama_guard_3_1b.value: DEFAULT_LG_V3_SAFETY_CATEGORIES,
CoreModelId.llama_guard_3_11b_vision.value: DEFAULT_LG_V3_SAFETY_CATEGORIES,
}
PROMPT_TASK = "Task: Check if there is unsafe content in '$agent_type' messages in conversations according our safety policy with the below categories."
SAFETY_CATEGORIES = """
<BEGIN UNSAFE CONTENT CATEGORIES>
$categories
<END UNSAFE CONTENT CATEGORIES>
"""
PROMPT_CONVERSATION = """
<BEGIN CONVERSATION>
$conversations
<END CONVERSATION>
"""
PROMPT_INSTRUCTIONS = """
Provide your safety assessment for ONLY THE LAST $agent_type message in the above conversation:
- First line must read 'safe' or 'unsafe'.
- If unsafe, a second line must include a comma-separated list of violated categories."""
PROMPT_TEMPLATE = Template(
f"{PROMPT_TASK}{SAFETY_CATEGORIES}{PROMPT_CONVERSATION}{PROMPT_INSTRUCTIONS}"
)
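# For orientation, a rendered prompt (abridged) looks roughly like:
#
#   Task: Check if there is unsafe content in 'User' messages in conversations
#   according our safety policy with the below categories.
#   <BEGIN UNSAFE CONTENT CATEGORIES>
#   S1: Violent Crimes.
#   ...
#   S13: Elections.
#   <END UNSAFE CONTENT CATEGORIES>
#   <BEGIN CONVERSATION>
#   User: <last user message>
#   <END CONVERSATION>
#   Provide your safety assessment for ONLY THE LAST User message in the above conversation: ...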
class LlamaGuardShield(ShieldBase):
def __init__(
self,
model: str,
inference_api: Inference,
excluded_categories: List[str] = None,
on_violation_action: OnViolationAction = OnViolationAction.RAISE,
):
super().__init__(on_violation_action)
if excluded_categories is None:
excluded_categories = []
assert len(excluded_categories) == 0 or all(
x in SAFETY_CATEGORIES_TO_CODE_MAP.values() for x in excluded_categories
), "Invalid categories in excluded categories. Expected format is ['S1', 'S2', ..]"
if model not in MODEL_TO_SAFETY_CATEGORIES_MAP:
raise ValueError(f"Unsupported model: {model}")
self.model = model
self.inference_api = inference_api
self.excluded_categories = excluded_categories
def check_unsafe_response(self, response: str) -> Optional[str]:
match = re.match(r"^unsafe\n(.*)$", response)
if match:
# extracts the unsafe code
extracted = match.group(1)
return extracted
return None
def get_safety_categories(self) -> List[str]:
excluded_categories = self.excluded_categories
if set(excluded_categories) == set(SAFETY_CATEGORIES_TO_CODE_MAP.values()):
excluded_categories = []
final_categories = []
all_categories = MODEL_TO_SAFETY_CATEGORIES_MAP[self.model]
for cat in all_categories:
cat_code = SAFETY_CATEGORIES_TO_CODE_MAP[cat]
if cat_code in excluded_categories:
continue
final_categories.append(f"{cat_code}: {cat}.")
return final_categories
    def validate_messages(self, messages: List[Message]) -> List[Message]:
if len(messages) == 0:
raise ValueError("Messages must not be empty")
if messages[0].role != Role.user.value:
raise ValueError("Messages must start with user")
if len(messages) >= 2 and (
messages[0].role == Role.user.value and messages[1].role == Role.user.value
):
messages = messages[1:]
for i in range(1, len(messages)):
if messages[i].role == messages[i - 1].role:
raise ValueError(
f"Messages must alternate between user and assistant. Message {i} has the same role as message {i - 1}"
)
return messages
async def run(self, messages: List[Message]) -> ShieldResponse:
messages = self.validate_messages(messages)
if self.model == CoreModelId.llama_guard_3_11b_vision.value:
shield_input_message = self.build_vision_shield_input(messages)
else:
shield_input_message = self.build_text_shield_input(messages)
# TODO: llama-stack inference protocol has issues with non-streaming inference code
content = ""
async for chunk in await self.inference_api.chat_completion(
model=self.model,
messages=[shield_input_message],
stream=True,
):
event = chunk.event
if event.event_type == ChatCompletionResponseEventType.progress:
assert isinstance(event.delta, str)
content += event.delta
content = content.strip()
shield_response = self.get_shield_response(content)
return shield_response
def build_text_shield_input(self, messages: List[Message]) -> UserMessage:
return UserMessage(content=self.build_prompt(messages))
def build_vision_shield_input(self, messages: List[Message]) -> UserMessage:
conversation = []
most_recent_img = None
for m in messages[::-1]:
if isinstance(m.content, str):
conversation.append(m)
elif isinstance(m.content, ImageMedia):
if most_recent_img is None and m.role == Role.user.value:
most_recent_img = m.content
conversation.append(m)
elif isinstance(m.content, list):
content = []
for c in m.content:
if isinstance(c, str):
content.append(c)
elif isinstance(c, ImageMedia):
if most_recent_img is None and m.role == Role.user.value:
most_recent_img = c
content.append(c)
else:
raise ValueError(f"Unknown content type: {c}")
conversation.append(UserMessage(content=content))
else:
raise ValueError(f"Unknown content type: {m.content}")
prompt = []
if most_recent_img is not None:
prompt.append(most_recent_img)
prompt.append(self.build_prompt(conversation[::-1]))
return UserMessage(content=prompt)
def build_prompt(self, messages: List[Message]) -> str:
categories = self.get_safety_categories()
categories_str = "\n".join(categories)
conversations_str = "\n\n".join(
[
f"{m.role.capitalize()}: {interleaved_text_media_as_str(m.content)}"
for m in messages
]
)
return PROMPT_TEMPLATE.substitute(
agent_type=messages[-1].role.capitalize(),
categories=categories_str,
conversations=conversations_str,
)
def get_shield_response(self, response: str) -> ShieldResponse:
response = response.strip()
if response == SAFE_RESPONSE:
return ShieldResponse(is_violation=False)
unsafe_code = self.check_unsafe_response(response)
if unsafe_code:
unsafe_code_list = unsafe_code.split(",")
if set(unsafe_code_list).issubset(set(self.excluded_categories)):
return ShieldResponse(is_violation=False)
return ShieldResponse(
is_violation=True,
violation_type=unsafe_code,
violation_return_message=CANNED_RESPONSE_TEXT,
)
raise ValueError(f"Unexpected response: {response}")

View file

@ -0,0 +1,145 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from enum import auto, Enum
from typing import List
import torch
from llama_models.llama3.api.datatypes import Message
from termcolor import cprint
from .base import message_content_as_str, OnViolationAction, ShieldResponse, TextShield
class PromptGuardShield(TextShield):
class Mode(Enum):
INJECTION = auto()
JAILBREAK = auto()
_instances = {}
_model_cache = None
@staticmethod
def instance(
model_dir: str,
threshold: float = 0.9,
temperature: float = 1.0,
mode: "PromptGuardShield.Mode" = Mode.JAILBREAK,
on_violation_action=OnViolationAction.RAISE,
) -> "PromptGuardShield":
action_value = on_violation_action.value
key = (model_dir, threshold, temperature, mode, action_value)
if key not in PromptGuardShield._instances:
PromptGuardShield._instances[key] = PromptGuardShield(
model_dir=model_dir,
threshold=threshold,
temperature=temperature,
mode=mode,
on_violation_action=on_violation_action,
)
return PromptGuardShield._instances[key]
def __init__(
self,
model_dir: str,
threshold: float = 0.9,
temperature: float = 1.0,
mode: "PromptGuardShield.Mode" = Mode.JAILBREAK,
on_violation_action: OnViolationAction = OnViolationAction.RAISE,
):
super().__init__(on_violation_action)
assert (
model_dir is not None
), "Must provide a model directory for prompt injection shield"
if temperature <= 0:
raise ValueError("Temperature must be greater than 0")
self.device = "cuda"
if PromptGuardShield._model_cache is None:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
# load model and tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_dir)
model = AutoModelForSequenceClassification.from_pretrained(
model_dir, device_map=self.device
)
PromptGuardShield._model_cache = (tokenizer, model)
self.tokenizer, self.model = PromptGuardShield._model_cache
self.temperature = temperature
self.threshold = threshold
self.mode = mode
def convert_messages_to_text(self, messages: List[Message]) -> str:
return message_content_as_str(messages[-1])
async def run_impl(self, text: str) -> ShieldResponse:
# run model on messages and return response
inputs = self.tokenizer(text, return_tensors="pt")
inputs = {name: tensor.to(self.model.device) for name, tensor in inputs.items()}
with torch.no_grad():
outputs = self.model(**inputs)
logits = outputs[0]
probabilities = torch.softmax(logits / self.temperature, dim=-1)
score_embedded = probabilities[0, 1].item()
score_malicious = probabilities[0, 2].item()
cprint(
f"Ran PromptGuardShield and got Scores: Embedded: {score_embedded}, Malicious: {score_malicious}",
color="magenta",
)
if self.mode == self.Mode.INJECTION and (
score_embedded + score_malicious > self.threshold
):
return ShieldResponse(
is_violation=True,
violation_type=f"prompt_injection:embedded={score_embedded},malicious={score_malicious}",
violation_return_message="Sorry, I cannot do this.",
)
elif self.mode == self.Mode.JAILBREAK and score_malicious > self.threshold:
return ShieldResponse(
is_violation=True,
violation_type=f"prompt_injection:malicious={score_malicious}",
violation_return_message="Sorry, I cannot do this.",
)
return ShieldResponse(
is_violation=False,
)
class JailbreakShield(PromptGuardShield):
def __init__(
self,
model_dir: str,
threshold: float = 0.9,
temperature: float = 1.0,
on_violation_action: OnViolationAction = OnViolationAction.RAISE,
):
super().__init__(
model_dir=model_dir,
threshold=threshold,
temperature=temperature,
mode=PromptGuardShield.Mode.JAILBREAK,
on_violation_action=on_violation_action,
)
class InjectionShield(PromptGuardShield):
def __init__(
self,
model_dir: str,
threshold: float = 0.9,
temperature: float = 1.0,
on_violation_action: OnViolationAction = OnViolationAction.RAISE,
):
super().__init__(
model_dir=model_dir,
threshold=threshold,
temperature=temperature,
mode=PromptGuardShield.Mode.INJECTION,
on_violation_action=on_violation_action,
)
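# Usage sketch (illustrative; the checkpoint path is a placeholder, and the class-index
# layout (column 1 = embedded/injection, column 2 = malicious/jailbreak) is the same
# assumption run_impl already makes):
#
#   shield = JailbreakShield.instance(model_dir="/path/to/Prompt-Guard-86M")
#   resp = await shield.run([user_message])   # user_message: a Message with role "user"
#   if resp.is_violation:
#       print(resp.violation_return_message)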

View file

@ -0,0 +1,112 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from typing import Any, Dict, List
from llama_stack.distribution.utils.model_utils import model_local_dir
from llama_stack.apis.inference import * # noqa: F403
from llama_stack.apis.safety import * # noqa: F403
from llama_models.llama3.api.datatypes import * # noqa: F403
from llama_stack.distribution.datatypes import Api
from llama_stack.providers.datatypes import ShieldsProtocolPrivate
from .base import OnViolationAction, ShieldBase
from .config import SafetyConfig
from .llama_guard import LlamaGuardShield
from .prompt_guard import InjectionShield, JailbreakShield, PromptGuardShield
PROMPT_GUARD_MODEL = "Prompt-Guard-86M"
class MetaReferenceSafetyImpl(Safety, ShieldsProtocolPrivate):
def __init__(self, config: SafetyConfig, deps) -> None:
self.config = config
self.inference_api = deps[Api.inference]
self.available_shields = []
if config.llama_guard_shield:
self.available_shields.append(ShieldType.llama_guard.value)
if config.enable_prompt_guard:
self.available_shields.append(ShieldType.prompt_guard.value)
async def initialize(self) -> None:
if self.config.enable_prompt_guard:
model_dir = model_local_dir(PROMPT_GUARD_MODEL)
_ = PromptGuardShield.instance(model_dir)
async def shutdown(self) -> None:
pass
async def register_shield(self, shield: ShieldDef) -> None:
raise ValueError("Registering dynamic shields is not supported")
async def list_shields(self) -> List[ShieldDef]:
return [
ShieldDef(
identifier=shield_type,
shield_type=shield_type,
params={},
)
for shield_type in self.available_shields
]
async def run_shield(
self,
shield_type: str,
messages: List[Message],
params: Dict[str, Any] = None,
) -> RunShieldResponse:
shield_def = await self.shield_store.get_shield(shield_type)
if not shield_def:
raise ValueError(f"Unknown shield {shield_type}")
shield = self.get_shield_impl(shield_def)
messages = messages.copy()
# some shields like llama-guard require the first message to be a user message
# since this might be a tool call, first role might not be user
if len(messages) > 0 and messages[0].role != Role.user.value:
messages[0] = UserMessage(content=messages[0].content)
# TODO: we can refactor ShieldBase, etc. to be inline with the API types
res = await shield.run(messages)
violation = None
if res.is_violation and shield.on_violation_action != OnViolationAction.IGNORE:
violation = SafetyViolation(
violation_level=(
ViolationLevel.ERROR
if shield.on_violation_action == OnViolationAction.RAISE
else ViolationLevel.WARN
),
user_message=res.violation_return_message,
metadata={
"violation_type": res.violation_type,
},
)
return RunShieldResponse(violation=violation)
def get_shield_impl(self, shield: ShieldDef) -> ShieldBase:
if shield.shield_type == ShieldType.llama_guard.value:
cfg = self.config.llama_guard_shield
return LlamaGuardShield(
model=cfg.model,
inference_api=self.inference_api,
excluded_categories=cfg.excluded_categories,
)
elif shield.shield_type == ShieldType.prompt_guard.value:
model_dir = model_local_dir(PROMPT_GUARD_MODEL)
subtype = shield.params.get("prompt_guard_type", "injection")
if subtype == "injection":
return InjectionShield.instance(model_dir)
elif subtype == "jailbreak":
return JailbreakShield.instance(model_dir)
else:
raise ValueError(f"Unknown prompt guard type: {subtype}")
else:
raise ValueError(f"Unknown shield type: {shield.shield_type}")

View file

@ -0,0 +1,23 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from typing import Dict
from llama_stack.distribution.datatypes import Api, ProviderSpec
from .config import MetaReferenceScoringConfig
async def get_provider_impl(
config: MetaReferenceScoringConfig,
deps: Dict[Api, ProviderSpec],
):
from .scoring import MetaReferenceScoringImpl
impl = MetaReferenceScoringImpl(
config, deps[Api.datasetio], deps[Api.datasets], deps[Api.inference]
)
await impl.initialize()
return impl

View file

@@ -0,0 +1,9 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from pydantic import BaseModel
from llama_stack.apis.scoring import *  # noqa: F401, F403
class MetaReferenceScoringConfig(BaseModel): ...

View file

@@ -0,0 +1,137 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from typing import Any, Dict, List
from llama_models.llama3.api.datatypes import * # noqa: F403
from llama_stack.apis.scoring import * # noqa: F403
from llama_stack.apis.scoring_functions import * # noqa: F403
from llama_stack.apis.common.type_system import * # noqa: F403
from llama_stack.apis.datasetio import * # noqa: F403
from llama_stack.apis.datasets import * # noqa: F403
from llama_stack.apis.inference.inference import Inference
from llama_stack.providers.datatypes import ScoringFunctionsProtocolPrivate
from llama_stack.providers.inline.meta_reference.scoring.scoring_fn.equality_scoring_fn import (
EqualityScoringFn,
)
from llama_stack.providers.inline.meta_reference.scoring.scoring_fn.llm_as_judge_scoring_fn import (
LlmAsJudgeScoringFn,
)
from llama_stack.providers.inline.meta_reference.scoring.scoring_fn.subset_of_scoring_fn import (
SubsetOfScoringFn,
)
from .config import MetaReferenceScoringConfig
FIXED_FNS = [EqualityScoringFn, SubsetOfScoringFn]
LLM_JUDGE_FNS = [LlmAsJudgeScoringFn]
class MetaReferenceScoringImpl(Scoring, ScoringFunctionsProtocolPrivate):
def __init__(
self,
config: MetaReferenceScoringConfig,
datasetio_api: DatasetIO,
datasets_api: Datasets,
inference_api: Inference,
) -> None:
self.config = config
self.datasetio_api = datasetio_api
self.datasets_api = datasets_api
self.inference_api = inference_api
self.scoring_fn_id_impls = {}
async def initialize(self) -> None:
for x in FIXED_FNS:
impl = x()
for fn_defs in impl.get_supported_scoring_fn_defs():
self.scoring_fn_id_impls[fn_defs.identifier] = impl
for x in LLM_JUDGE_FNS:
impl = x(inference_api=self.inference_api)
for fn_defs in impl.get_supported_scoring_fn_defs():
self.scoring_fn_id_impls[fn_defs.identifier] = impl
self.llm_as_judge_fn = impl
async def shutdown(self) -> None: ...
async def list_scoring_functions(self) -> List[ScoringFnDef]:
scoring_fn_defs_list = [
fn_def
for impl in self.scoring_fn_id_impls.values()
for fn_def in impl.get_supported_scoring_fn_defs()
]
for f in scoring_fn_defs_list:
assert f.identifier.startswith(
"meta-reference"
), "All meta-reference scoring fn must have identifier prefixed with 'meta-reference'! "
return scoring_fn_defs_list
async def register_scoring_function(self, function_def: ScoringFnDef) -> None:
self.llm_as_judge_fn.register_scoring_fn_def(function_def)
self.scoring_fn_id_impls[function_def.identifier] = self.llm_as_judge_fn
async def validate_scoring_input_dataset_schema(self, dataset_id: str) -> None:
dataset_def = await self.datasets_api.get_dataset(dataset_identifier=dataset_id)
if not dataset_def.dataset_schema or len(dataset_def.dataset_schema) == 0:
raise ValueError(
f"Dataset {dataset_id} does not have a schema defined. Please define a schema for the dataset."
)
for required_column in ["generated_answer", "expected_answer", "input_query"]:
if required_column not in dataset_def.dataset_schema:
raise ValueError(
f"Dataset {dataset_id} does not have a '{required_column}' column."
)
if dataset_def.dataset_schema[required_column].type != "string":
raise ValueError(
f"Dataset {dataset_id} does not have a '{required_column}' column of type 'string'."
)
async def score_batch(
self,
dataset_id: str,
scoring_functions: List[str],
save_results_dataset: bool = False,
) -> ScoreBatchResponse:
await self.validate_scoring_input_dataset_schema(dataset_id=dataset_id)
all_rows = await self.datasetio_api.get_rows_paginated(
dataset_id=dataset_id,
rows_in_page=-1,
)
res = await self.score(
input_rows=all_rows.rows, scoring_functions=scoring_functions
)
if save_results_dataset:
# TODO: persist and register dataset on to server for reading
# self.datasets_api.register_dataset()
raise NotImplementedError("Save results dataset not implemented yet")
return ScoreBatchResponse(
results=res.results,
)
async def score(
self, input_rows: List[Dict[str, Any]], scoring_functions: List[str]
) -> ScoreResponse:
res = {}
for scoring_fn_id in scoring_functions:
if scoring_fn_id not in self.scoring_fn_id_impls:
raise ValueError(f"Scoring function {scoring_fn_id} is not supported.")
scoring_fn = self.scoring_fn_id_impls[scoring_fn_id]
score_results = await scoring_fn.score(input_rows, scoring_fn_id)
agg_results = await scoring_fn.aggregate(score_results)
res[scoring_fn_id] = ScoringResult(
score_rows=score_results,
aggregated_results=agg_results,
)
return ScoreResponse(
results=res,
)
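For context, a minimal sketch of driving the score() path above, assuming `scoring` is an already-initialized MetaReferenceScoringImpl, that ScoringResult exposes its fields as attributes, and using made-up rows:

async def demo(scoring) -> None:
    rows = [
        {"input_query": "2 + 2?", "expected_answer": "4", "generated_answer": "4"},
        {"input_query": "Capital of France?", "expected_answer": "Paris", "generated_answer": "Lyon"},
    ]
    response = await scoring.score(
        input_rows=rows,
        scoring_functions=["meta-reference::equality"],
    )
    result = response.results["meta-reference::equality"]
    # score_rows has one {"score": ...} entry per row; aggregated_results is the
    # output of the scoring fn's aggregate() (accuracy for the equality fn).
    print(result.score_rows, result.aggregated_results)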

View file

@@ -0,0 +1,5 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

View file

@@ -0,0 +1,57 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from abc import ABC, abstractmethod
from typing import Any, Dict, List, Optional
from llama_stack.apis.scoring_functions import * # noqa: F401, F403
from llama_stack.apis.scoring import * # noqa: F401, F403
class BaseScoringFn(ABC):
"""
Base interface class for all meta-reference scoring_fns.
Each scoring_fn needs to implement the following methods:
- score_row(self, row)
- aggregate(self, scoring_fn_results)
"""
def __init__(self, *args, **kwargs) -> None:
super().__init__(*args, **kwargs)
self.supported_fn_defs_registry = {}
def __str__(self) -> str:
return self.__class__.__name__
def get_supported_scoring_fn_defs(self) -> List[ScoringFnDef]:
        return list(self.supported_fn_defs_registry.values())
def register_scoring_fn_def(self, scoring_fn_def: ScoringFnDef) -> None:
if scoring_fn_def.identifier in self.supported_fn_defs_registry:
raise ValueError(
f"Scoring function def with identifier {scoring_fn_def.identifier} already exists."
)
self.supported_fn_defs_registry[scoring_fn_def.identifier] = scoring_fn_def
@abstractmethod
async def score_row(
self, input_row: Dict[str, Any], scoring_fn_identifier: Optional[str] = None
) -> ScoringResultRow:
raise NotImplementedError()
@abstractmethod
async def aggregate(
self, scoring_results: List[ScoringResultRow]
) -> Dict[str, Any]:
raise NotImplementedError()
async def score(
self,
input_rows: List[Dict[str, Any]],
scoring_fn_identifier: Optional[str] = None,
) -> List[ScoringResultRow]:
return [
await self.score_row(input_row, scoring_fn_identifier)
for input_row in input_rows
]
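To make the contract in the docstring concrete, a hypothetical subclass sketch; the identifier and ScoringFnDef below are invented for illustration and are not part of this commit:

from llama_stack.apis.common.type_system import NumberType
from llama_stack.apis.scoring_functions import ScoringFnDef

exact_length = ScoringFnDef(
    identifier="example::exact_length",  # hypothetical identifier
    description="Returns 1.0 if the generated and expected answers have the same length.",
    parameters=[],
    return_type=NumberType(),
)

class ExactLengthScoringFn(BaseScoringFn):
    def __init__(self, *args, **kwargs) -> None:
        super().__init__(*args, **kwargs)
        self.supported_fn_defs_registry = {exact_length.identifier: exact_length}

    async def score_row(
        self, input_row: Dict[str, Any], scoring_fn_identifier: Optional[str] = None
    ) -> ScoringResultRow:
        match = len(input_row["generated_answer"]) == len(input_row["expected_answer"])
        return {"score": 1.0 if match else 0.0}

    async def aggregate(self, scoring_results: List[ScoringResultRow]) -> Dict[str, Any]:
        return {"average": sum(r["score"] for r in scoring_results) / len(scoring_results)}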

View file

@@ -0,0 +1,31 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from pathlib import Path
from typing import Any, Dict, List
from llama_stack.apis.scoring import ScoringResultRow
FN_DEFS_PATH = Path(__file__).parent / "fn_defs"
def aggregate_accuracy(scoring_results: List[ScoringResultRow]) -> Dict[str, Any]:
num_correct = sum(result["score"] for result in scoring_results)
avg_score = num_correct / len(scoring_results)
return {
"accuracy": avg_score,
"num_correct": num_correct,
"num_total": len(scoring_results),
}
def aggregate_average(scoring_results: List[ScoringResultRow]) -> Dict[str, Any]:
return {
"average": sum(
result["score"] for result in scoring_results if result["score"] is not None
)
/ len([_ for _ in scoring_results if _["score"] is not None]),
}
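As a quick sanity check of the two aggregators above, a batch with scores 1.0, 0.0 and 1.0 works out like this (illustrative only):

results = [{"score": 1.0}, {"score": 0.0}, {"score": 1.0}]
assert aggregate_accuracy(results) == {"accuracy": 2.0 / 3, "num_correct": 2.0, "num_total": 3}
assert aggregate_average(results) == {"average": 2.0 / 3}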

View file

@@ -0,0 +1,54 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from typing import Any, Dict, List, Optional
from llama_stack.providers.inline.meta_reference.scoring.scoring_fn.base_scoring_fn import (
BaseScoringFn,
)
from llama_stack.apis.scoring_functions import * # noqa: F401, F403
from llama_stack.apis.scoring import * # noqa: F401, F403
from llama_stack.apis.common.type_system import * # noqa: F403
from llama_stack.providers.inline.meta_reference.scoring.scoring_fn.common import (
aggregate_accuracy,
)
from llama_stack.providers.inline.meta_reference.scoring.scoring_fn.fn_defs.equality import (
equality,
)
class EqualityScoringFn(BaseScoringFn):
"""
A scoring_fn that assigns a score of 1.0 if the input string matches the target string, and 0.0 otherwise.
"""
def __init__(self, *args, **kwargs) -> None:
super().__init__(*args, **kwargs)
self.supported_fn_defs_registry = {
equality.identifier: equality,
}
async def score_row(
self,
input_row: Dict[str, Any],
scoring_fn_identifier: Optional[str] = "equality",
) -> ScoringResultRow:
assert "expected_answer" in input_row, "Expected answer not found in input row."
assert (
"generated_answer" in input_row
), "Generated answer not found in input row."
expected_answer = input_row["expected_answer"]
generated_answer = input_row["generated_answer"]
score = 1.0 if expected_answer == generated_answer else 0.0
return {
"score": score,
}
async def aggregate(
self, scoring_results: List[ScoringResultRow]
) -> Dict[str, Any]:
return aggregate_accuracy(scoring_results)
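A small hedged sketch of exercising EqualityScoringFn directly (rows are made up; run inside an event loop):

import asyncio

async def _demo() -> None:
    fn = EqualityScoringFn()
    rows = [
        {"expected_answer": "Paris", "generated_answer": "Paris"},
        {"expected_answer": "Paris", "generated_answer": "Lyon"},
    ]
    scored = await fn.score(rows)      # [{"score": 1.0}, {"score": 0.0}]
    print(await fn.aggregate(scored))  # {"accuracy": 0.5, "num_correct": 1.0, "num_total": 2}

asyncio.run(_demo())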

View file

@@ -0,0 +1,5 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

View file

@@ -0,0 +1,16 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from llama_stack.apis.common.type_system import NumberType
from llama_stack.apis.scoring_functions import ScoringFnDef
equality = ScoringFnDef(
identifier="meta-reference::equality",
description="Returns 1.0 if the input is equal to the target, 0.0 otherwise.",
parameters=[],
return_type=NumberType(),
)

View file

@@ -0,0 +1,36 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from llama_stack.apis.scoring_functions import * # noqa: F401, F403
from llama_stack.apis.scoring import * # noqa: F401, F403
from llama_stack.apis.common.type_system import NumberType
JUDGE_PROMPT = """
You will be given a question, an expected_answer, and a system_answer.
Your task is to provide a 'total rating' scoring how well the system_answer matches the ground truth in expected_answer in terms of factual correctness to the question.
Give your answer as an integer on a scale of 0 to 5, where 0 means that the system_answer is not correct at all compared with expected_answer, and 5 means that the answer completely and correctly answers the question.
Provide your feedback as follows:
Feedback:::
Total rating: (your rating, as an int between 0 and 5)
Now here are the question, expected_answer, system_answer.
Question: {input_query}
Expected Answer: {expected_answer}
System Answer: {generated_answer}
Feedback:::
Total rating:
"""
llm_as_judge_8b_correctness = ScoringFnDef(
identifier="meta-reference::llm_as_judge_8b_correctness",
description="Llm As Judge Scoring Function",
parameters=[],
return_type=NumberType(),
context=LLMAsJudgeContext(
prompt_template=JUDGE_PROMPT,
judge_model="Llama3.1-8B-Instruct",
judge_score_regex=[r"Total rating: (\d+)", r"rating: (\d+)", r"Rating: (\d+)"],
),
)
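The judge_score_regex patterns above are tried in order against the judge model's reply (see llm_as_judge_scoring_fn.py later in this commit); a small standalone sketch of that extraction with a made-up response:

import re

sample_reply = "Feedback:::\nTotal rating: 4"
rating = None
for pattern in [r"Total rating: (\d+)", r"rating: (\d+)", r"Rating: (\d+)"]:
    match = re.search(pattern, sample_reply)
    if match:
        rating = int(match.group(1))
        break
print(rating)  # 4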

View file

@@ -0,0 +1,16 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from llama_stack.apis.common.type_system import NumberType
from llama_stack.apis.scoring_functions import ScoringFnDef
subset_of = ScoringFnDef(
identifier="meta-reference::subset_of",
description="Returns 1.0 if the expected is included in generated, 0.0 otherwise.",
parameters=[],
return_type=NumberType(),
)

View file

@@ -0,0 +1,89 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from typing import Any, Dict, List, Optional
from llama_stack.apis.inference.inference import Inference
from llama_stack.providers.inline.meta_reference.scoring.scoring_fn.base_scoring_fn import (
BaseScoringFn,
)
from llama_stack.apis.scoring_functions import * # noqa: F401, F403
from llama_stack.apis.scoring import * # noqa: F401, F403
from llama_stack.apis.common.type_system import * # noqa: F403
import re
from llama_stack.providers.inline.meta_reference.scoring.scoring_fn.common import (
aggregate_average,
)
from llama_stack.providers.inline.meta_reference.scoring.scoring_fn.fn_defs.llm_as_judge_8b_correctness import (
llm_as_judge_8b_correctness,
)
class LlmAsJudgeScoringFn(BaseScoringFn):
"""
    A scoring_fn that uses an LLM judge to rate the generated answer against the expected answer.
"""
def __init__(self, inference_api: Inference, *arg, **kwargs) -> None:
super().__init__(*arg, **kwargs)
self.inference_api = inference_api
self.supported_fn_defs_registry = {
llm_as_judge_8b_correctness.identifier: llm_as_judge_8b_correctness,
}
async def score_row(
self,
input_row: Dict[str, Any],
scoring_fn_identifier: Optional[str] = None,
) -> ScoringResultRow:
assert (
scoring_fn_identifier is not None
), "Scoring function identifier not found."
fn_def = self.supported_fn_defs_registry[scoring_fn_identifier]
assert fn_def.context is not None, f"LLMAsJudgeContext not found for {fn_def}."
assert (
fn_def.context.prompt_template is not None
), "LLM Judge prompt_template not found."
assert (
fn_def.context.judge_score_regex is not None
), "LLM Judge judge_score_regex not found."
input_query = input_row["input_query"]
expected_answer = input_row["expected_answer"]
generated_answer = input_row["generated_answer"]
judge_input_msg = fn_def.context.prompt_template.format(
input_query=input_query,
expected_answer=expected_answer,
generated_answer=generated_answer,
)
judge_response = await self.inference_api.chat_completion(
model=fn_def.context.judge_model,
messages=[
{
"role": "user",
"content": judge_input_msg,
}
],
)
content = judge_response.completion_message.content
rating_regexs = fn_def.context.judge_score_regex
judge_rating = None
for regex in rating_regexs:
match = re.search(regex, content)
if match:
judge_rating = int(match.group(1))
break
return {
"score": judge_rating,
"judge_feedback": content,
}
async def aggregate(
self, scoring_results: List[ScoringResultRow]
) -> Dict[str, Any]:
return aggregate_average(scoring_results)

View file

@@ -0,0 +1,48 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from typing import Any, Dict, List, Optional
from llama_stack.providers.inline.meta_reference.scoring.scoring_fn.base_scoring_fn import (
BaseScoringFn,
)
from llama_stack.apis.scoring_functions import * # noqa: F401, F403
from llama_stack.apis.scoring import * # noqa: F401, F403
from llama_stack.apis.common.type_system import * # noqa: F403
from llama_stack.providers.inline.meta_reference.scoring.scoring_fn.common import (
aggregate_accuracy,
)
from llama_stack.providers.inline.meta_reference.scoring.scoring_fn.fn_defs.subset_of import (
subset_of,
)
class SubsetOfScoringFn(BaseScoringFn):
"""
A scoring_fn that assigns a score of 1.0 if the expected string is included in the generated string, and 0.0 otherwise.
"""
def __init__(self, *args, **kwargs) -> None:
super().__init__(*args, **kwargs)
self.supported_fn_defs_registry = {
subset_of.identifier: subset_of,
}
async def score_row(
self,
input_row: Dict[str, Any],
scoring_fn_identifier: Optional[str] = "subset_of",
) -> ScoringResultRow:
expected_answer = input_row["expected_answer"]
generated_answer = input_row["generated_answer"]
score = 1.0 if expected_answer in generated_answer else 0.0
return {
"score": score,
}
async def aggregate(
self, scoring_results: List[ScoringResultRow]
) -> Dict[str, Any]:
return aggregate_accuracy(scoring_results)
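A quick hedged check of the containment semantics above, distinct from exact equality (the row is invented for the example):

import asyncio

async def _demo() -> None:
    fn = SubsetOfScoringFn()
    row = {"expected_answer": "Paris", "generated_answer": "The capital of France is Paris."}
    print(await fn.score_row(row))  # {"score": 1.0} because "Paris" is a substring

asyncio.run(_demo())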

View file

@@ -0,0 +1,15 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from .config import ConsoleConfig
async def get_provider_impl(config: ConsoleConfig, _deps):
from .console import ConsoleTelemetryImpl
impl = ConsoleTelemetryImpl(config)
await impl.initialize()
return impl

View file

@@ -0,0 +1,13 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from llama_models.schema_utils import json_schema_type
from pydantic import BaseModel
@json_schema_type
class ConsoleConfig(BaseModel): ...

View file

@@ -0,0 +1,89 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from typing import Optional
from llama_stack.apis.telemetry import * # noqa: F403
from .config import ConsoleConfig
class ConsoleTelemetryImpl(Telemetry):
def __init__(self, config: ConsoleConfig) -> None:
self.config = config
self.spans = {}
async def initialize(self) -> None: ...
async def shutdown(self) -> None: ...
async def log_event(self, event: Event):
if (
isinstance(event, StructuredLogEvent)
and event.payload.type == StructuredLogType.SPAN_START.value
):
self.spans[event.span_id] = event.payload
names = []
span_id = event.span_id
while True:
span_payload = self.spans.get(span_id)
if not span_payload:
break
names = [span_payload.name] + names
span_id = span_payload.parent_span_id
span_name = ".".join(names) if names else None
formatted = format_event(event, span_name)
if formatted:
print(formatted)
async def get_trace(self, trace_id: str) -> Trace:
raise NotImplementedError()
COLORS = {
"reset": "\033[0m",
"bold": "\033[1m",
"dim": "\033[2m",
"red": "\033[31m",
"green": "\033[32m",
"yellow": "\033[33m",
"blue": "\033[34m",
"magenta": "\033[35m",
"cyan": "\033[36m",
"white": "\033[37m",
}
SEVERITY_COLORS = {
LogSeverity.VERBOSE: COLORS["dim"] + COLORS["white"],
LogSeverity.DEBUG: COLORS["cyan"],
LogSeverity.INFO: COLORS["green"],
LogSeverity.WARN: COLORS["yellow"],
LogSeverity.ERROR: COLORS["red"],
LogSeverity.CRITICAL: COLORS["bold"] + COLORS["red"],
}
def format_event(event: Event, span_name: Optional[str]) -> Optional[str]:
timestamp = event.timestamp.strftime("%H:%M:%S.%f")[:-3]
span = ""
if span_name:
span = f"{COLORS['magenta']}[{span_name}]{COLORS['reset']} "
if isinstance(event, UnstructuredLogEvent):
severity_color = SEVERITY_COLORS.get(event.severity, COLORS["reset"])
return (
f"{COLORS['dim']}{timestamp}{COLORS['reset']} "
f"{severity_color}[{event.severity.name}]{COLORS['reset']} "
f"{span}"
f"{event.message}"
)
elif isinstance(event, StructuredLogEvent):
return None
return f"Unknown event type: {event}"