llama_toolchain -> llama_stack

Ashwin Bharambe 2024-09-16 17:21:08 -07:00
parent f372355409
commit 2cf731faea
175 changed files with 300 additions and 279 deletions

5
llama_stack/__init__.py Normal file
View file

@@ -0,0 +1,5 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

View file

@@ -0,0 +1,5 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

View file

@@ -0,0 +1,7 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from .api import * # noqa: F401 F403

View file

@@ -0,0 +1,467 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from datetime import datetime
from enum import Enum
from typing import Any, Dict, List, Literal, Optional, Protocol, Union
from llama_models.schema_utils import json_schema_type, webmethod
from pydantic import BaseModel, ConfigDict, Field
from typing_extensions import Annotated
from llama_models.llama3.api.datatypes import * # noqa: F403
from llama_stack.common.deployment_types import * # noqa: F403
from llama_stack.inference.api import * # noqa: F403
from llama_stack.safety.api import * # noqa: F403
from llama_stack.memory.api import * # noqa: F403
@json_schema_type
class Attachment(BaseModel):
content: InterleavedTextMedia | URL
mime_type: str
class AgenticSystemTool(Enum):
brave_search = "brave_search"
wolfram_alpha = "wolfram_alpha"
photogen = "photogen"
code_interpreter = "code_interpreter"
function_call = "function_call"
memory = "memory"
class ToolDefinitionCommon(BaseModel):
input_shields: Optional[List[ShieldDefinition]] = Field(default_factory=list)
output_shields: Optional[List[ShieldDefinition]] = Field(default_factory=list)
class SearchEngineType(Enum):
bing = "bing"
brave = "brave"
@json_schema_type
class SearchToolDefinition(ToolDefinitionCommon):
    # NOTE: brave_search is just a placeholder since the model always uses
    # brave_search as the tool call name
type: Literal[AgenticSystemTool.brave_search.value] = (
AgenticSystemTool.brave_search.value
)
engine: SearchEngineType = SearchEngineType.brave
remote_execution: Optional[RestAPIExecutionConfig] = None
@json_schema_type
class WolframAlphaToolDefinition(ToolDefinitionCommon):
type: Literal[AgenticSystemTool.wolfram_alpha.value] = (
AgenticSystemTool.wolfram_alpha.value
)
remote_execution: Optional[RestAPIExecutionConfig] = None
@json_schema_type
class PhotogenToolDefinition(ToolDefinitionCommon):
type: Literal[AgenticSystemTool.photogen.value] = AgenticSystemTool.photogen.value
remote_execution: Optional[RestAPIExecutionConfig] = None
@json_schema_type
class CodeInterpreterToolDefinition(ToolDefinitionCommon):
type: Literal[AgenticSystemTool.code_interpreter.value] = (
AgenticSystemTool.code_interpreter.value
)
enable_inline_code_execution: bool = True
remote_execution: Optional[RestAPIExecutionConfig] = None
@json_schema_type
class FunctionCallToolDefinition(ToolDefinitionCommon):
type: Literal[AgenticSystemTool.function_call.value] = (
AgenticSystemTool.function_call.value
)
function_name: str
description: str
parameters: Dict[str, ToolParamDefinition]
remote_execution: Optional[RestAPIExecutionConfig] = None
class _MemoryBankConfigCommon(BaseModel):
bank_id: str
class AgenticSystemVectorMemoryBankConfig(_MemoryBankConfigCommon):
type: Literal[MemoryBankType.vector.value] = MemoryBankType.vector.value
class AgenticSystemKeyValueMemoryBankConfig(_MemoryBankConfigCommon):
type: Literal[MemoryBankType.keyvalue.value] = MemoryBankType.keyvalue.value
keys: List[str] # what keys to focus on
class AgenticSystemKeywordMemoryBankConfig(_MemoryBankConfigCommon):
type: Literal[MemoryBankType.keyword.value] = MemoryBankType.keyword.value
class AgenticSystemGraphMemoryBankConfig(_MemoryBankConfigCommon):
type: Literal[MemoryBankType.graph.value] = MemoryBankType.graph.value
entities: List[str] # what entities to focus on
MemoryBankConfig = Annotated[
Union[
AgenticSystemVectorMemoryBankConfig,
AgenticSystemKeyValueMemoryBankConfig,
AgenticSystemKeywordMemoryBankConfig,
AgenticSystemGraphMemoryBankConfig,
],
Field(discriminator="type"),
]
class MemoryQueryGenerator(Enum):
default = "default"
llm = "llm"
custom = "custom"
class DefaultMemoryQueryGeneratorConfig(BaseModel):
type: Literal[MemoryQueryGenerator.default.value] = (
MemoryQueryGenerator.default.value
)
sep: str = " "
class LLMMemoryQueryGeneratorConfig(BaseModel):
type: Literal[MemoryQueryGenerator.llm.value] = MemoryQueryGenerator.llm.value
model: str
template: str
class CustomMemoryQueryGeneratorConfig(BaseModel):
type: Literal[MemoryQueryGenerator.custom.value] = MemoryQueryGenerator.custom.value
MemoryQueryGeneratorConfig = Annotated[
Union[
DefaultMemoryQueryGeneratorConfig,
LLMMemoryQueryGeneratorConfig,
CustomMemoryQueryGeneratorConfig,
],
Field(discriminator="type"),
]
class MemoryToolDefinition(ToolDefinitionCommon):
type: Literal[AgenticSystemTool.memory.value] = AgenticSystemTool.memory.value
memory_bank_configs: List[MemoryBankConfig] = Field(default_factory=list)
# This config defines how a query is generated using the messages
# for memory bank retrieval.
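    # For example, the default generator simply joins the message contents with `sep`,
    # while LLMMemoryQueryGeneratorConfig(model=..., template=...) renders the template
    # over the turn's messages and asks that model to produce the retrieval query
    # (see the reference provider's rag/context_retriever.py).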
query_generator_config: MemoryQueryGeneratorConfig = Field(
default=DefaultMemoryQueryGeneratorConfig()
)
max_tokens_in_context: int = 4096
max_chunks: int = 10
AgenticSystemToolDefinition = Annotated[
Union[
SearchToolDefinition,
WolframAlphaToolDefinition,
PhotogenToolDefinition,
CodeInterpreterToolDefinition,
FunctionCallToolDefinition,
MemoryToolDefinition,
],
Field(discriminator="type"),
]
class StepCommon(BaseModel):
turn_id: str
step_id: str
started_at: Optional[datetime] = None
completed_at: Optional[datetime] = None
class StepType(Enum):
inference = "inference"
tool_execution = "tool_execution"
shield_call = "shield_call"
memory_retrieval = "memory_retrieval"
@json_schema_type
class InferenceStep(StepCommon):
model_config = ConfigDict(protected_namespaces=())
step_type: Literal[StepType.inference.value] = StepType.inference.value
model_response: CompletionMessage
@json_schema_type
class ToolExecutionStep(StepCommon):
step_type: Literal[StepType.tool_execution.value] = StepType.tool_execution.value
tool_calls: List[ToolCall]
tool_responses: List[ToolResponse]
@json_schema_type
class ShieldCallStep(StepCommon):
step_type: Literal[StepType.shield_call.value] = StepType.shield_call.value
response: ShieldResponse
@json_schema_type
class MemoryRetrievalStep(StepCommon):
step_type: Literal[StepType.memory_retrieval.value] = (
StepType.memory_retrieval.value
)
memory_bank_ids: List[str]
inserted_context: InterleavedTextMedia
Step = Annotated[
Union[
InferenceStep,
ToolExecutionStep,
ShieldCallStep,
MemoryRetrievalStep,
],
Field(discriminator="step_type"),
]
@json_schema_type
class Turn(BaseModel):
"""A single turn in an interaction with an Agentic System."""
turn_id: str
session_id: str
input_messages: List[
Union[
UserMessage,
ToolResponseMessage,
]
]
steps: List[Step]
output_message: CompletionMessage
output_attachments: List[Attachment] = Field(default_factory=list)
started_at: datetime
completed_at: Optional[datetime] = None
@json_schema_type
class Session(BaseModel):
"""A single session of an interaction with an Agentic System."""
session_id: str
session_name: str
turns: List[Turn]
started_at: datetime
memory_bank: Optional[MemoryBank] = None
class AgentConfigCommon(BaseModel):
sampling_params: Optional[SamplingParams] = SamplingParams()
input_shields: Optional[List[ShieldDefinition]] = Field(default_factory=list)
output_shields: Optional[List[ShieldDefinition]] = Field(default_factory=list)
tools: Optional[List[AgenticSystemToolDefinition]] = Field(default_factory=list)
tool_choice: Optional[ToolChoice] = Field(default=ToolChoice.auto)
tool_prompt_format: Optional[ToolPromptFormat] = Field(
default=ToolPromptFormat.json
)
@json_schema_type
class AgentConfig(AgentConfigCommon):
model: str
instructions: str
class AgentConfigOverridablePerTurn(AgentConfigCommon):
instructions: Optional[str] = None
class AgenticSystemTurnResponseEventType(Enum):
step_start = "step_start"
step_complete = "step_complete"
step_progress = "step_progress"
turn_start = "turn_start"
turn_complete = "turn_complete"
@json_schema_type
class AgenticSystemTurnResponseStepStartPayload(BaseModel):
event_type: Literal[AgenticSystemTurnResponseEventType.step_start.value] = (
AgenticSystemTurnResponseEventType.step_start.value
)
step_type: StepType
step_id: str
metadata: Optional[Dict[str, Any]] = Field(default_factory=dict)
@json_schema_type
class AgenticSystemTurnResponseStepCompletePayload(BaseModel):
event_type: Literal[AgenticSystemTurnResponseEventType.step_complete.value] = (
AgenticSystemTurnResponseEventType.step_complete.value
)
step_type: StepType
step_details: Step
@json_schema_type
class AgenticSystemTurnResponseStepProgressPayload(BaseModel):
model_config = ConfigDict(protected_namespaces=())
event_type: Literal[AgenticSystemTurnResponseEventType.step_progress.value] = (
AgenticSystemTurnResponseEventType.step_progress.value
)
step_type: StepType
step_id: str
model_response_text_delta: Optional[str] = None
tool_call_delta: Optional[ToolCallDelta] = None
tool_response_text_delta: Optional[str] = None
@json_schema_type
class AgenticSystemTurnResponseTurnStartPayload(BaseModel):
event_type: Literal[AgenticSystemTurnResponseEventType.turn_start.value] = (
AgenticSystemTurnResponseEventType.turn_start.value
)
turn_id: str
@json_schema_type
class AgenticSystemTurnResponseTurnCompletePayload(BaseModel):
event_type: Literal[AgenticSystemTurnResponseEventType.turn_complete.value] = (
AgenticSystemTurnResponseEventType.turn_complete.value
)
turn: Turn
@json_schema_type
class AgenticSystemTurnResponseEvent(BaseModel):
"""Streamed agent execution response."""
payload: Annotated[
Union[
AgenticSystemTurnResponseStepStartPayload,
AgenticSystemTurnResponseStepProgressPayload,
AgenticSystemTurnResponseStepCompletePayload,
AgenticSystemTurnResponseTurnStartPayload,
AgenticSystemTurnResponseTurnCompletePayload,
],
Field(discriminator="event_type"),
]
@json_schema_type
class AgenticSystemCreateResponse(BaseModel):
agent_id: str
@json_schema_type
class AgenticSystemSessionCreateResponse(BaseModel):
session_id: str
@json_schema_type
class AgenticSystemTurnCreateRequest(AgentConfigOverridablePerTurn):
agent_id: str
session_id: str
    # TODO: figure out how we can simplify this and clarify why
    # ToolResponseMessage needs to be here (it represents function call
    # execution happening outside the system)
messages: List[
Union[
UserMessage,
ToolResponseMessage,
]
]
attachments: Optional[List[Attachment]] = None
stream: Optional[bool] = False
@json_schema_type
class AgenticSystemTurnResponseStreamChunk(BaseModel):
event: AgenticSystemTurnResponseEvent
@json_schema_type
class AgenticSystemStepResponse(BaseModel):
step: Step
class AgenticSystem(Protocol):
@webmethod(route="/agentic_system/create")
async def create_agentic_system(
self,
agent_config: AgentConfig,
) -> AgenticSystemCreateResponse: ...
@webmethod(route="/agentic_system/turn/create")
async def create_agentic_system_turn(
self,
agent_id: str,
session_id: str,
messages: List[
Union[
UserMessage,
ToolResponseMessage,
]
],
attachments: Optional[List[Attachment]] = None,
stream: Optional[bool] = False,
) -> AgenticSystemTurnResponseStreamChunk: ...
@webmethod(route="/agentic_system/turn/get")
async def get_agentic_system_turn(
self,
agent_id: str,
turn_id: str,
) -> Turn: ...
@webmethod(route="/agentic_system/step/get")
async def get_agentic_system_step(
self, agent_id: str, turn_id: str, step_id: str
) -> AgenticSystemStepResponse: ...
@webmethod(route="/agentic_system/session/create")
async def create_agentic_system_session(
self,
agent_id: str,
session_name: str,
) -> AgenticSystemSessionCreateResponse: ...
@webmethod(route="/agentic_system/session/get")
async def get_agentic_system_session(
self,
agent_id: str,
session_id: str,
turn_ids: Optional[List[str]] = None,
) -> Session: ...
@webmethod(route="/agentic_system/session/delete")
async def delete_agentic_system_session(
self, agent_id: str, session_id: str
) -> None: ...
@webmethod(route="/agentic_system/delete")
async def delete_agentic_system(
self,
agent_id: str,
) -> None: ...

View file

@@ -0,0 +1,212 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import asyncio
import json
from typing import AsyncGenerator
import fire
import httpx
from pydantic import BaseModel
from termcolor import cprint
from llama_models.llama3.api.datatypes import * # noqa: F403
from llama_stack.core.datatypes import RemoteProviderConfig
from .api import * # noqa: F403
from .event_logger import EventLogger
async def get_client_impl(config: RemoteProviderConfig, _deps):
return AgenticSystemClient(config.url)
def encodable_dict(d: BaseModel):
return json.loads(d.json())
class AgenticSystemClient(AgenticSystem):
def __init__(self, base_url: str):
self.base_url = base_url
async def create_agentic_system(
self, agent_config: AgentConfig
) -> AgenticSystemCreateResponse:
async with httpx.AsyncClient() as client:
response = await client.post(
f"{self.base_url}/agentic_system/create",
json={
"agent_config": encodable_dict(agent_config),
},
headers={"Content-Type": "application/json"},
)
response.raise_for_status()
return AgenticSystemCreateResponse(**response.json())
async def create_agentic_system_session(
self,
agent_id: str,
session_name: str,
) -> AgenticSystemSessionCreateResponse:
async with httpx.AsyncClient() as client:
response = await client.post(
f"{self.base_url}/agentic_system/session/create",
json={
"agent_id": agent_id,
"session_name": session_name,
},
headers={"Content-Type": "application/json"},
)
response.raise_for_status()
return AgenticSystemSessionCreateResponse(**response.json())
async def create_agentic_system_turn(
self,
request: AgenticSystemTurnCreateRequest,
) -> AsyncGenerator:
async with httpx.AsyncClient() as client:
async with client.stream(
"POST",
f"{self.base_url}/agentic_system/turn/create",
json=encodable_dict(request),
headers={"Content-Type": "application/json"},
timeout=20,
) as response:
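                # The server streams server-sent-events style lines ("data: {...}");
                # each one is parsed into an AgenticSystemTurnResponseStreamChunk below.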
async for line in response.aiter_lines():
if line.startswith("data:"):
data = line[len("data: ") :]
try:
jdata = json.loads(data)
if "error" in jdata:
cprint(data, "red")
continue
yield AgenticSystemTurnResponseStreamChunk(**jdata)
except Exception as e:
print(data)
print(f"Error with parsing or validation: {e}")
async def _run_agent(api, tool_definitions, user_prompts, attachments=None):
agent_config = AgentConfig(
model="Meta-Llama3.1-8B-Instruct",
instructions="You are a helpful assistant",
sampling_params=SamplingParams(temperature=1.0, top_p=0.9),
tools=tool_definitions,
tool_choice=ToolChoice.auto,
tool_prompt_format=ToolPromptFormat.function_tag,
)
create_response = await api.create_agentic_system(agent_config)
session_response = await api.create_agentic_system_session(
agent_id=create_response.agent_id,
session_name="test_session",
)
for content in user_prompts:
cprint(f"User> {content}", color="white", attrs=["bold"])
iterator = api.create_agentic_system_turn(
AgenticSystemTurnCreateRequest(
agent_id=create_response.agent_id,
session_id=session_response.session_id,
messages=[
UserMessage(content=content),
],
attachments=attachments,
stream=True,
)
)
async for event, log in EventLogger().log(iterator):
if log is not None:
log.print()
async def run_main(host: str, port: int):
api = AgenticSystemClient(f"http://{host}:{port}")
tool_definitions = [
SearchToolDefinition(engine=SearchEngineType.bing),
WolframAlphaToolDefinition(),
CodeInterpreterToolDefinition(),
]
tool_definitions += [
FunctionCallToolDefinition(
function_name="get_boiling_point",
description="Get the boiling point of a imaginary liquids (eg. polyjuice)",
parameters={
"liquid_name": ToolParamDefinition(
param_type="str",
description="The name of the liquid",
required=True,
),
"celcius": ToolParamDefinition(
param_type="str",
description="Whether to return the boiling point in Celcius",
required=False,
),
},
),
]
user_prompts = [
"Who are you?",
"what is the 100th prime number?",
"Search web for who was 44th President of USA?",
"Write code to check if a number is prime. Use that to check if 7 is prime",
"What is the boiling point of polyjuicepotion ?",
]
await _run_agent(api, tool_definitions, user_prompts)
async def run_rag(host: str, port: int):
api = AgenticSystemClient(f"http://{host}:{port}")
urls = [
"memory_optimizations.rst",
"chat.rst",
"llama3.rst",
"datasets.rst",
"qat_finetune.rst",
"lora_finetune.rst",
]
attachments = [
Attachment(
content=URL(
uri=f"https://raw.githubusercontent.com/pytorch/torchtune/main/docs/source/tutorials/{url}"
),
mime_type="text/plain",
)
        for url in urls
]
    # Alternatively, you can pre-populate the memory bank with documents,
    # for example using `llama_stack.memory.client`. Then you can grab the
    # bank_id from the output of that run.
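    # A hypothetical sketch (the bank id is a placeholder, not produced by this script):
    #   memory_bank_configs=[
    #       AgenticSystemVectorMemoryBankConfig(bank_id="<bank_id from that run>"),
    #   ]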
tool_definitions = [
MemoryToolDefinition(
max_tokens_in_context=2048,
memory_bank_configs=[],
),
]
user_prompts = [
"How do I use Lora?",
"Tell me briefly about llama3 and torchtune",
]
await _run_agent(api, tool_definitions, user_prompts, attachments)
def main(host: str, port: int, rag: bool = False):
fn = run_rag if rag else run_main
asyncio.run(fn(host, port))
if __name__ == "__main__":
fire.Fire(main)

View file

@@ -0,0 +1,184 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from typing import Optional
from llama_models.llama3.api.datatypes import * # noqa: F403
from llama_models.llama3.api.tool_utils import ToolUtils
from termcolor import cprint
from llama_stack.agentic_system.api import AgenticSystemTurnResponseEventType, StepType
class LogEvent:
def __init__(
self,
role: Optional[str] = None,
content: str = "",
end: str = "\n",
color="white",
):
self.role = role
self.content = content
self.color = color
self.end = "\n" if end is None else end
def __str__(self):
if self.role is not None:
return f"{self.role}> {self.content}"
else:
return f"{self.content}"
def print(self, flush=True):
cprint(f"{str(self)}", color=self.color, end=self.end, flush=flush)
EventType = AgenticSystemTurnResponseEventType
class EventLogger:
async def log(
self,
event_generator,
stream=True,
tool_prompt_format: ToolPromptFormat = ToolPromptFormat.json,
):
previous_event_type = None
previous_step_type = None
async for chunk in event_generator:
if not hasattr(chunk, "event"):
                # Need to check for custom tool first
                # since it does not produce an event but instead
                # a Message
if isinstance(chunk, ToolResponseMessage):
yield chunk, LogEvent(
role="CustomTool", content=chunk.content, color="grey"
)
continue
event = chunk.event
event_type = event.payload.event_type
if event_type in {
EventType.turn_start.value,
EventType.turn_complete.value,
}:
                # Currently not logging any turn related info
yield event, None
continue
step_type = event.payload.step_type
# handle safety
if (
step_type == StepType.shield_call
and event_type == EventType.step_complete.value
):
response = event.payload.step_details.response
if not response.is_violation:
yield event, LogEvent(
role=step_type, content="No Violation", color="magenta"
)
else:
yield event, LogEvent(
role=step_type,
content=f"{response.violation_type} {response.violation_return_message}",
color="red",
)
# handle inference
if step_type == StepType.inference:
if stream:
if event_type == EventType.step_start.value:
# TODO: Currently this event is never received
yield event, LogEvent(
role=step_type, content="", end="", color="yellow"
)
elif event_type == EventType.step_progress.value:
                        # HACK: if the previous event was not inference's step_progress,
                        # this is the first time we are getting the model inference response,
                        # i.e. the equivalent of step_start for inference. Hence,
                        # start with "Model>".
if (
previous_event_type != EventType.step_progress.value
and previous_step_type != StepType.inference
):
yield event, LogEvent(
role=step_type, content="", end="", color="yellow"
)
if event.payload.tool_call_delta:
if isinstance(event.payload.tool_call_delta.content, str):
yield event, LogEvent(
role=None,
content=event.payload.tool_call_delta.content,
end="",
color="cyan",
)
else:
yield event, LogEvent(
role=None,
content=event.payload.model_response_text_delta,
end="",
color="yellow",
)
else:
# step_complete
yield event, LogEvent(role=None, content="")
else:
# Not streaming
if event_type == EventType.step_complete.value:
response = event.payload.step_details.model_response
if response.tool_calls:
content = ToolUtils.encode_tool_call(
response.tool_calls[0], tool_prompt_format
)
else:
content = response.content
yield event, LogEvent(
role=step_type,
content=content,
color="yellow",
)
# handle tool_execution
if (
step_type == StepType.tool_execution
and
# Only print tool calls and responses at the step_complete event
event_type == EventType.step_complete.value
):
details = event.payload.step_details
for t in details.tool_calls:
yield event, LogEvent(
role=step_type,
content=f"Tool:{t.tool_name} Args:{t.arguments}",
color="green",
)
for r in details.tool_responses:
yield event, LogEvent(
role=step_type,
content=f"Tool:{r.tool_name} Response:{r.content}",
color="green",
)
if (
step_type == StepType.memory_retrieval
and event_type == EventType.step_complete.value
):
details = event.payload.step_details
content = interleaved_text_media_as_str(details.inserted_context)
content = content[:200] + "..." if len(content) > 200 else content
yield event, LogEvent(
role=step_type,
content=f"Retrieved context from banks: {details.memory_bank_ids}.\n====\n{content}\n>",
color="cyan",
)
            previous_event_type = event_type
previous_step_type = step_type

View file

@@ -0,0 +1,96 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from typing import AsyncGenerator, List
from llama_models.llama3.api.datatypes import * # noqa: F403
from llama_stack.agentic_system.api import * # noqa: F403
from llama_stack.memory.api import * # noqa: F403
from llama_stack.safety.api import * # noqa: F403
from llama_stack.agentic_system.api import (
AgenticSystemTurnResponseEventType as EventType,
)
from llama_stack.tools.custom.datatypes import CustomTool
class AgentWithCustomToolExecutor:
def __init__(
self,
api: AgenticSystem,
agent_id: str,
session_id: str,
agent_config: AgentConfig,
custom_tools: List[CustomTool],
):
self.api = api
self.agent_id = agent_id
self.session_id = session_id
self.agent_config = agent_config
self.custom_tools = custom_tools
async def execute_turn(
self,
messages: List[Message],
attachments: Optional[List[Attachment]] = None,
max_iters: int = 5,
stream: bool = True,
) -> AsyncGenerator:
tools_dict = {t.get_name(): t for t in self.custom_tools}
current_messages = messages.copy()
n_iter = 0
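        # Drive turns in a loop: whenever the final output message calls one of our custom
        # tools, run that tool locally and feed its ToolResponseMessage into the next turn.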
while n_iter < max_iters:
n_iter += 1
request = AgenticSystemTurnCreateRequest(
agent_id=self.agent_id,
session_id=self.session_id,
messages=current_messages,
attachments=attachments,
stream=stream,
)
turn = None
async for chunk in self.api.create_agentic_system_turn(request):
if chunk.event.payload.event_type != EventType.turn_complete.value:
yield chunk
else:
turn = chunk.event.payload.turn
message = turn.output_message
if len(message.tool_calls) == 0:
yield chunk
return
if message.stop_reason == StopReason.out_of_tokens:
yield chunk
return
tool_call = message.tool_calls[0]
if tool_call.tool_name not in tools_dict:
m = ToolResponseMessage(
call_id=tool_call.call_id,
tool_name=tool_call.tool_name,
content=f"Unknown tool `{tool_call.tool_name}` was called. Try again with something else",
)
next_message = m
else:
tool = tools_dict[tool_call.tool_name]
result_messages = await execute_custom_tool(tool, message)
next_message = result_messages[0]
yield next_message
current_messages = [next_message]
async def execute_custom_tool(tool: CustomTool, message: Message) -> List[Message]:
result_messages = await tool.run([message])
assert (
len(result_messages) == 1
), f"Expected single message, got {len(result_messages)}"
return result_messages

View file

@@ -0,0 +1,30 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from typing import Dict
from llama_stack.core.datatypes import Api, ProviderSpec
from .config import MetaReferenceImplConfig
async def get_provider_impl(
config: MetaReferenceImplConfig, deps: Dict[Api, ProviderSpec]
):
from .agentic_system import MetaReferenceAgenticSystemImpl
assert isinstance(
config, MetaReferenceImplConfig
), f"Unexpected config type: {type(config)}"
impl = MetaReferenceAgenticSystemImpl(
config,
deps[Api.inference],
deps[Api.memory],
deps[Api.safety],
)
await impl.initialize()
return impl

View file

@@ -0,0 +1,797 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import asyncio
import copy
import os
import secrets
import shutil
import string
import tempfile
import uuid
from datetime import datetime
from typing import AsyncGenerator, List, Tuple
from urllib.parse import urlparse
import httpx
from termcolor import cprint
from llama_stack.agentic_system.api import * # noqa: F403
from llama_stack.inference.api import * # noqa: F403
from llama_stack.memory.api import * # noqa: F403
from llama_stack.safety.api import * # noqa: F403
from llama_stack.tools.base import BaseTool
from llama_stack.tools.builtin import (
interpret_content_as_attachment,
SingleMessageBuiltinTool,
)
from .rag.context_retriever import generate_rag_query
from .safety import SafetyException, ShieldRunnerMixin
def make_random_string(length: int = 8):
return "".join(
secrets.choice(string.ascii_letters + string.digits) for _ in range(length)
)
class ChatAgent(ShieldRunnerMixin):
def __init__(
self,
agent_config: AgentConfig,
inference_api: Inference,
memory_api: Memory,
safety_api: Safety,
builtin_tools: List[SingleMessageBuiltinTool],
max_infer_iters: int = 10,
):
self.agent_config = agent_config
self.inference_api = inference_api
self.memory_api = memory_api
self.safety_api = safety_api
self.max_infer_iters = max_infer_iters
self.tools_dict = {t.get_name(): t for t in builtin_tools}
self.tempdir = tempfile.mkdtemp()
self.sessions = {}
ShieldRunnerMixin.__init__(
self,
safety_api,
input_shields=agent_config.input_shields,
output_shields=agent_config.output_shields,
)
def __del__(self):
shutil.rmtree(self.tempdir)
def turn_to_messages(self, turn: Turn) -> List[Message]:
messages = []
        # We do not want to keep adding RAG context to the input messages.
        # Maybe this should be a parameter of the agentic instance
        # that can define its behavior in a custom way.
for m in turn.input_messages:
msg = m.copy()
if isinstance(msg, UserMessage):
msg.context = None
messages.append(msg)
# messages.extend(turn.input_messages)
for step in turn.steps:
if step.step_type == StepType.inference.value:
messages.append(step.model_response)
elif step.step_type == StepType.tool_execution.value:
for response in step.tool_responses:
messages.append(
ToolResponseMessage(
call_id=response.call_id,
tool_name=response.tool_name,
content=response.content,
)
)
elif step.step_type == StepType.shield_call.value:
response = step.response
if response.is_violation:
# CompletionMessage itself in the ShieldResponse
messages.append(
CompletionMessage(
content=response.violation_return_message,
stop_reason=StopReason.end_of_turn,
)
)
# print_dialog(messages)
return messages
def create_session(self, name: str) -> Session:
session_id = str(uuid.uuid4())
session = Session(
session_id=session_id,
session_name=name,
turns=[],
started_at=datetime.now(),
)
self.sessions[session_id] = session
return session
async def create_and_execute_turn(
self, request: AgenticSystemTurnCreateRequest
) -> AsyncGenerator:
assert (
request.session_id in self.sessions
), f"Session {request.session_id} not found"
session = self.sessions[request.session_id]
messages = []
for i, turn in enumerate(session.turns):
messages.extend(self.turn_to_messages(turn))
messages.extend(request.messages)
# print("processed dialog ======== ")
# print_dialog(messages)
turn_id = str(uuid.uuid4())
start_time = datetime.now()
yield AgenticSystemTurnResponseStreamChunk(
event=AgenticSystemTurnResponseEvent(
payload=AgenticSystemTurnResponseTurnStartPayload(
turn_id=turn_id,
)
)
)
steps = []
output_message = None
async for chunk in self.run(
session=session,
turn_id=turn_id,
input_messages=messages,
attachments=request.attachments or [],
sampling_params=self.agent_config.sampling_params,
stream=request.stream,
):
if isinstance(chunk, CompletionMessage):
cprint(
f"{chunk.role.capitalize()}: {chunk.content}",
"white",
attrs=["bold"],
)
output_message = chunk
continue
assert isinstance(
chunk, AgenticSystemTurnResponseStreamChunk
), f"Unexpected type {type(chunk)}"
event = chunk.event
if (
event.payload.event_type
== AgenticSystemTurnResponseEventType.step_complete.value
):
steps.append(event.payload.step_details)
yield chunk
assert output_message is not None
turn = Turn(
turn_id=turn_id,
session_id=request.session_id,
input_messages=request.messages,
output_message=output_message,
started_at=start_time,
completed_at=datetime.now(),
steps=steps,
)
session.turns.append(turn)
chunk = AgenticSystemTurnResponseStreamChunk(
event=AgenticSystemTurnResponseEvent(
payload=AgenticSystemTurnResponseTurnCompletePayload(
turn=turn,
)
)
)
yield chunk
async def run(
self,
session: Session,
turn_id: str,
input_messages: List[Message],
attachments: List[Attachment],
sampling_params: SamplingParams,
stream: bool = False,
) -> AsyncGenerator:
        # Doing async generators makes downstream code much simpler and everything amenable to
        # streaming. However, it also makes things complicated here because AsyncGenerators cannot
        # return a "final value" for the `yield from` statement. We simulate that by yielding a
        # final boolean (to see whether an exception happened) and then explicitly testing for it.
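        # (run_shields_wrapper and _run both yield `False` as that sentinel right before
        # terminating when a shield raises a violation.)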
async for res in self.run_shields_wrapper(
turn_id, input_messages, self.input_shields, "user-input"
):
if isinstance(res, bool):
return
else:
yield res
        final_response = None
        async for res in self._run(
session, turn_id, input_messages, attachments, sampling_params, stream
):
if isinstance(res, bool):
return
elif isinstance(res, CompletionMessage):
final_response = res
break
else:
yield res
assert final_response is not None
# for output shields run on the full input and output combination
messages = input_messages + [final_response]
async for res in self.run_shields_wrapper(
turn_id, messages, self.output_shields, "assistant-output"
):
if isinstance(res, bool):
return
else:
yield res
yield final_response
async def run_shields_wrapper(
self,
turn_id: str,
messages: List[Message],
shields: List[ShieldDefinition],
touchpoint: str,
) -> AsyncGenerator:
if len(shields) == 0:
return
step_id = str(uuid.uuid4())
try:
yield AgenticSystemTurnResponseStreamChunk(
event=AgenticSystemTurnResponseEvent(
payload=AgenticSystemTurnResponseStepStartPayload(
step_type=StepType.shield_call.value,
step_id=step_id,
metadata=dict(touchpoint=touchpoint),
)
)
)
await self.run_shields(messages, shields)
except SafetyException as e:
yield AgenticSystemTurnResponseStreamChunk(
event=AgenticSystemTurnResponseEvent(
payload=AgenticSystemTurnResponseStepCompletePayload(
step_type=StepType.shield_call.value,
step_details=ShieldCallStep(
step_id=step_id,
turn_id=turn_id,
response=e.response,
),
)
)
)
yield CompletionMessage(
content=str(e),
stop_reason=StopReason.end_of_turn,
)
yield False
yield AgenticSystemTurnResponseStreamChunk(
event=AgenticSystemTurnResponseEvent(
payload=AgenticSystemTurnResponseStepCompletePayload(
step_type=StepType.shield_call.value,
step_details=ShieldCallStep(
step_id=step_id,
turn_id=turn_id,
response=ShieldResponse(
# TODO: fix this, give each shield a shield type method and
# fire one event for each shield run
shield_type=BuiltinShield.llama_guard,
is_violation=False,
),
),
)
)
)
async def _run(
self,
session: Session,
turn_id: str,
input_messages: List[Message],
attachments: List[Attachment],
sampling_params: SamplingParams,
stream: bool = False,
) -> AsyncGenerator:
enabled_tools = set(t.type for t in self.agent_config.tools)
need_rag_context = await self._should_retrieve_context(
input_messages, attachments
)
if need_rag_context:
step_id = str(uuid.uuid4())
yield AgenticSystemTurnResponseStreamChunk(
event=AgenticSystemTurnResponseEvent(
payload=AgenticSystemTurnResponseStepStartPayload(
step_type=StepType.memory_retrieval.value,
step_id=step_id,
)
)
)
# TODO: find older context from the session and either replace it
# or append with a sliding window. this is really a very simplistic implementation
rag_context, bank_ids = await self._retrieve_context(
session, input_messages, attachments
)
step_id = str(uuid.uuid4())
yield AgenticSystemTurnResponseStreamChunk(
event=AgenticSystemTurnResponseEvent(
payload=AgenticSystemTurnResponseStepCompletePayload(
step_type=StepType.memory_retrieval.value,
step_id=step_id,
step_details=MemoryRetrievalStep(
turn_id=turn_id,
step_id=step_id,
memory_bank_ids=bank_ids,
inserted_context=rag_context or "",
),
)
)
)
if rag_context:
last_message = input_messages[-1]
last_message.context = "\n".join(rag_context)
elif attachments and AgenticSystemTool.code_interpreter.value in enabled_tools:
urls = [a.content for a in attachments if isinstance(a.content, URL)]
msg = await attachment_message(self.tempdir, urls)
input_messages.append(msg)
output_attachments = []
n_iter = 0
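        # Main inference/tool loop: stream chat_completion deltas as step_progress events;
        # if the model emits a builtin tool call, execute it and append the result message,
        # repeating until end_of_turn, token exhaustion, or max_infer_iters.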
while True:
msg = input_messages[-1]
if msg.role == Role.user.value:
color = "blue"
elif msg.role == Role.ipython.value:
color = "yellow"
else:
color = None
cprint(f"{str(msg)}", color=color)
step_id = str(uuid.uuid4())
yield AgenticSystemTurnResponseStreamChunk(
event=AgenticSystemTurnResponseEvent(
payload=AgenticSystemTurnResponseStepStartPayload(
step_type=StepType.inference.value,
step_id=step_id,
)
)
)
tool_calls = []
content = ""
stop_reason = None
async for chunk in self.inference_api.chat_completion(
self.agent_config.model,
input_messages,
tools=self._get_tools(),
tool_prompt_format=self.agent_config.tool_prompt_format,
stream=True,
sampling_params=sampling_params,
):
event = chunk.event
if event.event_type == ChatCompletionResponseEventType.start:
continue
elif event.event_type == ChatCompletionResponseEventType.complete:
stop_reason = StopReason.end_of_turn
continue
delta = event.delta
if isinstance(delta, ToolCallDelta):
if delta.parse_status == ToolCallParseStatus.success:
tool_calls.append(delta.content)
if stream:
yield AgenticSystemTurnResponseStreamChunk(
event=AgenticSystemTurnResponseEvent(
payload=AgenticSystemTurnResponseStepProgressPayload(
step_type=StepType.inference.value,
step_id=step_id,
model_response_text_delta="",
tool_call_delta=delta,
)
)
)
elif isinstance(delta, str):
content += delta
if stream and event.stop_reason is None:
yield AgenticSystemTurnResponseStreamChunk(
event=AgenticSystemTurnResponseEvent(
payload=AgenticSystemTurnResponseStepProgressPayload(
step_type=StepType.inference.value,
step_id=step_id,
model_response_text_delta=event.delta,
)
)
)
else:
raise ValueError(f"Unexpected delta type {type(delta)}")
if event.stop_reason is not None:
stop_reason = event.stop_reason
stop_reason = stop_reason or StopReason.out_of_tokens
message = CompletionMessage(
content=content,
stop_reason=stop_reason,
tool_calls=tool_calls,
)
yield AgenticSystemTurnResponseStreamChunk(
event=AgenticSystemTurnResponseEvent(
payload=AgenticSystemTurnResponseStepCompletePayload(
step_type=StepType.inference.value,
step_id=step_id,
step_details=InferenceStep(
# somewhere deep, we are re-assigning message or closing over some
# variable which causes message to mutate later on. fix with a
# `deepcopy` for now, but this is symptomatic of a deeper issue.
step_id=step_id,
turn_id=turn_id,
model_response=copy.deepcopy(message),
),
)
)
)
if n_iter >= self.max_infer_iters:
cprint("Done with MAX iterations, exiting.")
yield message
break
if stop_reason == StopReason.out_of_tokens:
cprint("Out of token budget, exiting.")
yield message
break
if len(message.tool_calls) == 0:
if stop_reason == StopReason.end_of_turn:
# TODO: UPDATE RETURN TYPE TO SEND A TUPLE OF (MESSAGE, ATTACHMENTS)
if len(output_attachments) > 0:
                        if isinstance(message.content, list):
                            message.content += output_attachments
                        else:
                            message.content = [message.content] + output_attachments
yield message
else:
cprint(f"Partial message: {str(message)}", color="green")
input_messages = input_messages + [message]
else:
cprint(f"{str(message)}", color="green")
try:
tool_call = message.tool_calls[0]
name = tool_call.tool_name
if not isinstance(name, BuiltinTool):
yield message
return
step_id = str(uuid.uuid4())
yield AgenticSystemTurnResponseStreamChunk(
event=AgenticSystemTurnResponseEvent(
payload=AgenticSystemTurnResponseStepStartPayload(
step_type=StepType.tool_execution.value,
step_id=step_id,
)
)
)
yield AgenticSystemTurnResponseStreamChunk(
event=AgenticSystemTurnResponseEvent(
payload=AgenticSystemTurnResponseStepProgressPayload(
step_type=StepType.tool_execution.value,
step_id=step_id,
tool_call=tool_call,
)
)
)
result_messages = await execute_tool_call_maybe(
self.tools_dict,
[message],
)
assert (
len(result_messages) == 1
), "Currently not supporting multiple messages"
result_message = result_messages[0]
yield AgenticSystemTurnResponseStreamChunk(
event=AgenticSystemTurnResponseEvent(
payload=AgenticSystemTurnResponseStepCompletePayload(
step_type=StepType.tool_execution.value,
step_details=ToolExecutionStep(
step_id=step_id,
turn_id=turn_id,
tool_calls=[tool_call],
tool_responses=[
ToolResponse(
call_id=result_message.call_id,
tool_name=result_message.tool_name,
content=result_message.content,
)
],
),
)
)
)
# TODO: add tool-input touchpoint and a "start" event for this step also
# but that needs a lot more refactoring of Tool code potentially
yield AgenticSystemTurnResponseStreamChunk(
event=AgenticSystemTurnResponseEvent(
payload=AgenticSystemTurnResponseStepCompletePayload(
step_type=StepType.shield_call.value,
step_details=ShieldCallStep(
step_id=str(uuid.uuid4()),
turn_id=turn_id,
response=ShieldResponse(
# TODO: fix this, give each shield a shield type method and
# fire one event for each shield run
shield_type=BuiltinShield.llama_guard,
is_violation=False,
),
),
)
)
)
except SafetyException as e:
yield AgenticSystemTurnResponseStreamChunk(
event=AgenticSystemTurnResponseEvent(
payload=AgenticSystemTurnResponseStepCompletePayload(
step_type=StepType.shield_call.value,
step_details=ShieldCallStep(
step_id=str(uuid.uuid4()),
turn_id=turn_id,
response=e.response,
),
)
)
)
yield CompletionMessage(
content=str(e),
stop_reason=StopReason.end_of_turn,
)
yield False
return
if out_attachment := interpret_content_as_attachment(
result_message.content
):
# NOTE: when we push this message back to the model, the model may ignore the
# attached file path etc. since the model is trained to only provide a user message
                    # with the summary. We keep all generated attachments and then attach them to the final message
output_attachments.append(out_attachment)
input_messages = input_messages + [message, result_message]
n_iter += 1
async def _ensure_memory_bank(self, session: Session) -> MemoryBank:
if session.memory_bank is None:
session.memory_bank = await self.memory_api.create_memory_bank(
name=f"memory_bank_{session.session_id}",
config=VectorMemoryBankConfig(
embedding_model="sentence-transformer/all-MiniLM-L6-v2",
chunk_size_in_tokens=512,
),
)
return session.memory_bank
async def _should_retrieve_context(
self, messages: List[Message], attachments: List[Attachment]
) -> bool:
enabled_tools = set(t.type for t in self.agent_config.tools)
if attachments:
if (
AgenticSystemTool.code_interpreter.value in enabled_tools
and self.agent_config.tool_choice == ToolChoice.required
):
return False
else:
return True
return AgenticSystemTool.memory.value in enabled_tools
def _memory_tool_definition(self) -> Optional[MemoryToolDefinition]:
for t in self.agent_config.tools:
if t.type == AgenticSystemTool.memory.value:
return t
return None
async def _retrieve_context(
self, session: Session, messages: List[Message], attachments: List[Attachment]
    ) -> Tuple[Optional[List[str]], List[str]]:  # (rag_context, bank_ids)
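        # Gather candidate bank ids: statically configured banks plus the per-session bank
        # (created lazily when this turn carries attachments), then query them all and
        # pack the top-scoring chunks into the context budget.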
bank_ids = []
memory = self._memory_tool_definition()
assert memory is not None, "Memory tool not configured"
bank_ids.extend(c.bank_id for c in memory.memory_bank_configs)
if attachments:
bank = await self._ensure_memory_bank(session)
bank_ids.append(bank.bank_id)
documents = [
MemoryBankDocument(
document_id=str(uuid.uuid4()),
content=a.content,
mime_type=a.mime_type,
metadata={},
)
for a in attachments
]
await self.memory_api.insert_documents(bank.bank_id, documents)
elif session.memory_bank:
bank_ids.append(session.memory_bank.bank_id)
if not bank_ids:
# this can happen if the per-session memory bank is not yet populated
# (i.e., no prior turns uploaded an Attachment)
return None, []
query = await generate_rag_query(
memory.query_generator_config, messages, inference_api=self.inference_api
)
tasks = [
self.memory_api.query_documents(
bank_id=bank_id,
query=query,
params={
"max_chunks": 5,
},
)
for bank_id in bank_ids
]
results: List[QueryDocumentsResponse] = await asyncio.gather(*tasks)
chunks = [c for r in results for c in r.chunks]
scores = [s for r in results for s in r.scores]
        if not chunks:
            return None, bank_ids
        # sort by score
        chunks, scores = zip(
            *sorted(zip(chunks, scores), key=lambda x: x[1], reverse=True)
        )
tokens = 0
picked = []
for c in chunks[: memory.max_chunks]:
tokens += c.token_count
if tokens > memory.max_tokens_in_context:
cprint(
f"Using {len(picked)} chunks; reached max tokens in context: {tokens}",
"red",
)
break
picked.append(f"id:{c.document_id}; content:{c.content}")
return [
"Here are the retrieved documents for relevant context:\n=== START-RETRIEVED-CONTEXT ===\n",
*picked,
"\n=== END-RETRIEVED-CONTEXT ===\n",
], bank_ids
def _get_tools(self) -> List[ToolDefinition]:
ret = []
for t in self.agent_config.tools:
if isinstance(t, SearchToolDefinition):
ret.append(ToolDefinition(tool_name=BuiltinTool.brave_search))
elif isinstance(t, WolframAlphaToolDefinition):
ret.append(ToolDefinition(tool_name=BuiltinTool.wolfram_alpha))
elif isinstance(t, PhotogenToolDefinition):
ret.append(ToolDefinition(tool_name=BuiltinTool.photogen))
elif isinstance(t, CodeInterpreterToolDefinition):
ret.append(ToolDefinition(tool_name=BuiltinTool.code_interpreter))
elif isinstance(t, FunctionCallToolDefinition):
ret.append(
ToolDefinition(
tool_name=t.function_name,
description=t.description,
parameters=t.parameters,
)
)
return ret
async def attachment_message(tempdir: str, urls: List[URL]) -> ToolResponseMessage:
content = []
for url in urls:
uri = url.uri
if uri.startswith("file://"):
filepath = uri[len("file://") :]
elif uri.startswith("http"):
path = urlparse(uri).path
basename = os.path.basename(path)
filepath = f"{tempdir}/{make_random_string() + basename}"
print(f"Downloading {url} -> {filepath}")
async with httpx.AsyncClient() as client:
r = await client.get(uri)
resp = r.text
with open(filepath, "w") as fp:
fp.write(resp)
else:
raise ValueError(f"Unsupported URL {url}")
content.append(f'# There is a file accessible to you at "{filepath}"\n')
return ToolResponseMessage(
call_id="",
tool_name=BuiltinTool.code_interpreter,
content=content,
)
async def execute_tool_call_maybe(
tools_dict: Dict[str, BaseTool], messages: List[CompletionMessage]
) -> List[ToolResponseMessage]:
    # While the Tools.run interface takes a list of messages,
    # all tools currently only run on a single message.
    # When this changes, we can drop this assert.
    # Whether to call tools on each message and aggregate,
    # or aggregate and call the tool once, remains to be seen.
assert len(messages) == 1, "Expected single message"
message = messages[0]
tool_call = message.tool_calls[0]
name = tool_call.tool_name
assert isinstance(name, BuiltinTool)
name = name.value
assert name in tools_dict, f"Tool {name} not found"
tool = tools_dict[name]
result_messages = await tool.run(messages)
return result_messages
def print_dialog(messages: List[Message]):
for i, m in enumerate(messages):
if m.role == Role.user.value:
color = "red"
elif m.role == Role.assistant.value:
color = "white"
elif m.role == Role.ipython.value:
color = "yellow"
elif m.role == Role.system.value:
color = "green"
else:
color = "white"
s = str(m)
cprint(f"{i} ::: {s[:100]}...", color=color)

View file

@@ -0,0 +1,145 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import logging
import tempfile
import uuid
from typing import AsyncGenerator
from llama_stack.inference.api import Inference
from llama_stack.memory.api import Memory
from llama_stack.safety.api import Safety
from llama_stack.agentic_system.api import * # noqa: F403
from llama_stack.tools.builtin import (
CodeInterpreterTool,
PhotogenTool,
SearchTool,
WolframAlphaTool,
)
from llama_stack.tools.safety import with_safety
from .agent_instance import ChatAgent
from .config import MetaReferenceImplConfig
logger = logging.getLogger()
logger.setLevel(logging.INFO)
AGENT_INSTANCES_BY_ID = {}
class MetaReferenceAgenticSystemImpl(AgenticSystem):
def __init__(
self,
config: MetaReferenceImplConfig,
inference_api: Inference,
memory_api: Memory,
safety_api: Safety,
):
self.config = config
self.inference_api = inference_api
self.memory_api = memory_api
self.safety_api = safety_api
async def initialize(self) -> None:
pass
async def create_agentic_system(
self,
agent_config: AgentConfig,
) -> AgenticSystemCreateResponse:
agent_id = str(uuid.uuid4())
builtin_tools = []
for tool_defn in agent_config.tools:
if isinstance(tool_defn, WolframAlphaToolDefinition):
key = self.config.wolfram_api_key
if not key:
raise ValueError("Wolfram API key not defined in config")
tool = WolframAlphaTool(key)
elif isinstance(tool_defn, SearchToolDefinition):
key = None
if tool_defn.engine == SearchEngineType.brave:
key = self.config.brave_search_api_key
elif tool_defn.engine == SearchEngineType.bing:
key = self.config.bing_search_api_key
if not key:
raise ValueError("API key not defined in config")
tool = SearchTool(tool_defn.engine, key)
elif isinstance(tool_defn, CodeInterpreterToolDefinition):
tool = CodeInterpreterTool()
elif isinstance(tool_defn, PhotogenToolDefinition):
tool = PhotogenTool(dump_dir=tempfile.mkdtemp())
else:
continue
builtin_tools.append(
with_safety(
tool,
self.safety_api,
tool_defn.input_shields,
tool_defn.output_shields,
)
)
AGENT_INSTANCES_BY_ID[agent_id] = ChatAgent(
agent_config=agent_config,
inference_api=self.inference_api,
safety_api=self.safety_api,
memory_api=self.memory_api,
builtin_tools=builtin_tools,
)
return AgenticSystemCreateResponse(
agent_id=agent_id,
)
async def create_agentic_system_session(
self,
agent_id: str,
session_name: str,
) -> AgenticSystemSessionCreateResponse:
assert agent_id in AGENT_INSTANCES_BY_ID, f"System {agent_id} not found"
agent = AGENT_INSTANCES_BY_ID[agent_id]
session = agent.create_session(session_name)
return AgenticSystemSessionCreateResponse(
session_id=session.session_id,
)
async def create_agentic_system_turn(
self,
agent_id: str,
session_id: str,
messages: List[
Union[
UserMessage,
ToolResponseMessage,
]
],
attachments: Optional[List[Attachment]] = None,
stream: Optional[bool] = False,
) -> AsyncGenerator:
# wrapper request to make it easier to pass around (internal only, not exposed to API)
request = AgenticSystemTurnCreateRequest(
agent_id=agent_id,
session_id=session_id,
messages=messages,
attachments=attachments,
stream=stream,
)
agent_id = request.agent_id
assert agent_id in AGENT_INSTANCES_BY_ID, f"System {agent_id} not found"
agent = AGENT_INSTANCES_BY_ID[agent_id]
assert (
request.session_id in agent.sessions
), f"Session {request.session_id} not found"
async for event in agent.create_and_execute_turn(request):
yield event

View file

@@ -0,0 +1,15 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from typing import Optional
from pydantic import BaseModel
class MetaReferenceImplConfig(BaseModel):
brave_search_api_key: Optional[str] = None
bing_search_api_key: Optional[str] = None
wolfram_api_key: Optional[str] = None

View file

@@ -0,0 +1,76 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from typing import List
from jinja2 import Template
from llama_models.llama3.api import * # noqa: F403
from llama_stack.agentic_system.api import (
DefaultMemoryQueryGeneratorConfig,
LLMMemoryQueryGeneratorConfig,
MemoryQueryGenerator,
MemoryQueryGeneratorConfig,
)
from termcolor import cprint # noqa: F401
from llama_stack.inference.api import * # noqa: F403
async def generate_rag_query(
config: MemoryQueryGeneratorConfig,
messages: List[Message],
**kwargs,
):
"""
Generates a query that will be used for
retrieving relevant information from the memory bank.
"""
if config.type == MemoryQueryGenerator.default.value:
query = await default_rag_query_generator(config, messages, **kwargs)
elif config.type == MemoryQueryGenerator.llm.value:
query = await llm_rag_query_generator(config, messages, **kwargs)
else:
raise NotImplementedError(f"Unsupported memory query generator {config.type}")
# cprint(f"Generated query >>>: {query}", color="green")
return query
async def default_rag_query_generator(
config: DefaultMemoryQueryGeneratorConfig,
messages: List[Message],
**kwargs,
):
return config.sep.join(interleaved_text_media_as_str(m.content) for m in messages)
async def llm_rag_query_generator(
config: LLMMemoryQueryGeneratorConfig,
messages: List[Message],
**kwargs,
):
assert "inference_api" in kwargs, "LLMRAGQueryGenerator needs inference_api"
inference_api = kwargs["inference_api"]
m_dict = {"messages": [m.model_dump() for m in messages]}
template = Template(config.template)
content = template.render(m_dict)
model = config.model
message = UserMessage(content=content)
response = inference_api.chat_completion(
ChatCompletionRequest(
model=model,
messages=[message],
stream=False,
)
)
async for chunk in response:
query = chunk.completion_message.content
return query

View file

@@ -0,0 +1,65 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from typing import List, Optional
from llama_models.llama3.api.datatypes import Message, Role, UserMessage
from termcolor import cprint
from llama_stack.safety.api import (
OnViolationAction,
RunShieldRequest,
Safety,
ShieldDefinition,
ShieldResponse,
)
class SafetyException(Exception): # noqa: N818
def __init__(self, response: ShieldResponse):
self.response = response
super().__init__(response.violation_return_message)
class ShieldRunnerMixin:
def __init__(
self,
safety_api: Safety,
        input_shields: Optional[List[ShieldDefinition]] = None,
        output_shields: Optional[List[ShieldDefinition]] = None,
):
self.safety_api = safety_api
self.input_shields = input_shields
self.output_shields = output_shields
async def run_shields(
self, messages: List[Message], shields: List[ShieldDefinition]
) -> List[ShieldResponse]:
messages = messages.copy()
# some shields like llama-guard require the first message to be a user message
# since this might be a tool call, first role might not be user
if len(messages) > 0 and messages[0].role != Role.user.value:
messages[0] = UserMessage(content=messages[0].content)
res = await self.safety_api.run_shields(
RunShieldRequest(
messages=messages,
shields=shields,
)
)
results = res.responses
for shield, r in zip(shields, results):
if r.is_violation:
if shield.on_violation_action == OnViolationAction.RAISE:
raise SafetyException(r)
elif shield.on_violation_action == OnViolationAction.WARN:
cprint(
f"[Warn]{shield.__class__.__name__} raised a warning",
color="red",
)
return results

View file

@@ -0,0 +1,34 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from typing import List
from llama_stack.core.datatypes import Api, InlineProviderSpec, ProviderSpec
def available_providers() -> List[ProviderSpec]:
return [
InlineProviderSpec(
api=Api.agentic_system,
provider_id="meta-reference",
pip_packages=[
"codeshield",
"matplotlib",
"pillow",
"pandas",
"scikit-learn",
"torch",
"transformers",
],
module="llama_stack.agentic_system.meta_reference",
config_class="llama_stack.agentic_system.meta_reference.MetaReferenceImplConfig",
api_dependencies=[
Api.inference,
Api.safety,
Api.memory,
],
),
]

View file

@@ -0,0 +1,5 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

View file

@@ -0,0 +1,7 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from .api import * # noqa: F401 F403

View file

@@ -0,0 +1,71 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from typing import List, Optional, Protocol
from llama_models.schema_utils import json_schema_type, webmethod
from pydantic import BaseModel, Field
from llama_models.llama3.api.datatypes import * # noqa: F403
from llama_stack.inference.api import * # noqa: F403
@json_schema_type
class BatchCompletionRequest(BaseModel):
model: str
content_batch: List[InterleavedTextMedia]
sampling_params: Optional[SamplingParams] = SamplingParams()
logprobs: Optional[LogProbConfig] = None
@json_schema_type
class BatchCompletionResponse(BaseModel):
completion_message_batch: List[CompletionMessage]
@json_schema_type
class BatchChatCompletionRequest(BaseModel):
model: str
messages_batch: List[List[Message]]
sampling_params: Optional[SamplingParams] = SamplingParams()
# zero-shot tool definitions as input to the model
tools: Optional[List[ToolDefinition]] = Field(default_factory=list)
tool_choice: Optional[ToolChoice] = Field(default=ToolChoice.auto)
tool_prompt_format: Optional[ToolPromptFormat] = Field(
default=ToolPromptFormat.json
)
logprobs: Optional[LogProbConfig] = None
@json_schema_type
class BatchChatCompletionResponse(BaseModel):
completion_message_batch: List[CompletionMessage]
class BatchInference(Protocol):
@webmethod(route="/batch_inference/completion")
async def batch_completion(
self,
model: str,
content_batch: List[InterleavedTextMedia],
sampling_params: Optional[SamplingParams] = SamplingParams(),
logprobs: Optional[LogProbConfig] = None,
) -> BatchCompletionResponse: ...
@webmethod(route="/batch_inference/chat_completion")
async def batch_chat_completion(
self,
model: str,
messages_batch: List[List[Message]],
sampling_params: Optional[SamplingParams] = SamplingParams(),
# zero-shot tool definitions as input to the model
        tools: Optional[List[ToolDefinition]] = None,
tool_choice: Optional[ToolChoice] = ToolChoice.auto,
tool_prompt_format: Optional[ToolPromptFormat] = ToolPromptFormat.json,
logprobs: Optional[LogProbConfig] = None,
) -> BatchChatCompletionResponse: ...

View file

@@ -0,0 +1,5 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

339
llama_stack/cli/download.py Normal file
View file

@ -0,0 +1,339 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import argparse
import asyncio
import json
import os
import shutil
import time
from datetime import datetime
from functools import partial
from pathlib import Path
from typing import Dict, List
import httpx
from pydantic import BaseModel
from termcolor import cprint
from llama_stack.cli.subcommand import Subcommand
class Download(Subcommand):
"""Llama cli for downloading llama toolchain assets"""
def __init__(self, subparsers: argparse._SubParsersAction):
super().__init__()
self.parser = subparsers.add_parser(
"download",
prog="llama download",
description="Download a model from llama.meta.com or Hugging Face Hub",
formatter_class=argparse.RawTextHelpFormatter,
)
setup_download_parser(self.parser)
def setup_download_parser(parser: argparse.ArgumentParser) -> None:
from llama_models.sku_list import all_registered_models
models = all_registered_models()
parser.add_argument(
"--source",
choices=["meta", "huggingface"],
required=True,
)
parser.add_argument(
"--model-id",
required=False,
help="See `llama model list` or `llama model list --show-all` for the list of available models",
)
parser.add_argument(
"--hf-token",
type=str,
required=False,
default=None,
help="Hugging Face API token. Needed for gated models like llama2/3. Will also try to read environment variable `HF_TOKEN` as default.",
)
parser.add_argument(
"--meta-url",
type=str,
required=False,
help="For source=meta, URL obtained from llama.meta.com after accepting license terms",
)
parser.add_argument(
"--ignore-patterns",
type=str,
required=False,
default="*.safetensors",
help="""
For source=huggingface, files matching any of the patterns are not downloaded. Defaults to ignoring
safetensors files to avoid downloading duplicate weights.
""",
)
parser.add_argument(
"--manifest-file",
type=str,
help="For source=meta, you can download models from a manifest file containing a file => URL mapping",
required=False,
)
parser.set_defaults(func=partial(run_download_cmd, parser=parser))
def _hf_download(
model: "Model",
hf_token: str,
ignore_patterns: str,
parser: argparse.ArgumentParser,
):
from huggingface_hub import snapshot_download
from huggingface_hub.utils import GatedRepoError, RepositoryNotFoundError
from llama_stack.common.model_utils import model_local_dir
repo_id = model.huggingface_repo
if repo_id is None:
raise ValueError(f"No repo id found for model {model.descriptor()}")
output_dir = model_local_dir(model.descriptor())
os.makedirs(output_dir, exist_ok=True)
try:
true_output_dir = snapshot_download(
repo_id,
local_dir=output_dir,
ignore_patterns=ignore_patterns,
token=hf_token,
library_name="llama-toolchain",
)
except GatedRepoError:
parser.error(
"It looks like you are trying to access a gated repository. Please ensure you "
"have access to the repository and have provided the proper Hugging Face API token "
"using the option `--hf-token` or by running `huggingface-cli login`."
"You can find your token by visiting https://huggingface.co/settings/tokens"
)
except RepositoryNotFoundError:
parser.error(f"Repository '{args.repo_id}' not found on the Hugging Face Hub.")
except Exception as e:
parser.error(e)
print(f"\nSuccessfully downloaded model to {true_output_dir}")
def _meta_download(model: "Model", meta_url: str):
from llama_models.sku_list import llama_meta_net_info
from llama_stack.common.model_utils import model_local_dir
output_dir = Path(model_local_dir(model.descriptor()))
os.makedirs(output_dir, exist_ok=True)
info = llama_meta_net_info(model)
# I believe we can use some concurrency here if needed but not sure it is worth it
for f in info.files:
output_file = str(output_dir / f)
url = meta_url.replace("*", f"{info.folder}/{f}")
total_size = info.pth_size if "consolidated" in f else 0
cprint(f"Downloading `{f}`...", "white")
downloader = ResumableDownloader(url, output_file, total_size)
asyncio.run(downloader.download())
print(f"\nSuccessfully downloaded model to {output_dir}")
cprint(f"\nMD5 Checksums are at: {output_dir / 'checklist.chk'}", "white")
def run_download_cmd(args: argparse.Namespace, parser: argparse.ArgumentParser):
from llama_models.sku_list import resolve_model
if args.manifest_file:
_download_from_manifest(args.manifest_file)
return
if args.model_id is None:
parser.error("Please provide a model id")
return
model = resolve_model(args.model_id)
if model is None:
parser.error(f"Model {args.model_id} not found")
return
if args.source == "huggingface":
_hf_download(model, args.hf_token, args.ignore_patterns, parser)
else:
meta_url = args.meta_url
if not meta_url:
meta_url = input(
"Please provide the signed URL you received via email (e.g., https://llama3-1.llamameta.net/*?Policy...): "
)
assert meta_url is not None and "llamameta.net" in meta_url
_meta_download(model, meta_url)
class ModelEntry(BaseModel):
model_id: str
files: Dict[str, str]
class Config:
protected_namespaces = ()
class Manifest(BaseModel):
models: List[ModelEntry]
expires_on: datetime
def _download_from_manifest(manifest_file: str):
from llama_stack.common.model_utils import model_local_dir
with open(manifest_file, "r") as f:
d = json.load(f)
manifest = Manifest(**d)
if datetime.now() > manifest.expires_on:
raise ValueError(f"Manifest URLs have expired on {manifest.expires_on}")
for entry in manifest.models:
print(f"Downloading model {entry.model_id}...")
output_dir = Path(model_local_dir(entry.model_id))
os.makedirs(output_dir, exist_ok=True)
if any(output_dir.iterdir()):
cprint(f"Output directory {output_dir} is not empty.", "red")
while True:
resp = input(
"Do you want to (C)ontinue download or (R)estart completely? (continue/restart): "
)
if resp.lower() == "restart" or resp.lower() == "r":
shutil.rmtree(output_dir)
os.makedirs(output_dir, exist_ok=True)
break
elif resp.lower() == "continue" or resp.lower() == "c":
print("Continuing download...")
break
else:
cprint("Invalid response. Please try again.", "red")
for fname, url in entry.files.items():
output_file = str(output_dir / fname)
downloader = ResumableDownloader(url, output_file)
asyncio.run(downloader.download())
class ResumableDownloader:
def __init__(
self,
url: str,
output_file: str,
total_size: int = 0,
buffer_size: int = 32 * 1024,
):
self.url = url
self.output_file = output_file
self.buffer_size = buffer_size
self.total_size = total_size
self.downloaded_size = 0
self.start_size = 0
self.start_time = 0
async def get_file_info(self, client: httpx.AsyncClient) -> None:
if self.total_size > 0:
return
# Force disable compression when trying to retrieve file size
response = await client.head(
self.url, follow_redirects=True, headers={"Accept-Encoding": "identity"}
)
response.raise_for_status()
self.url = str(response.url) # Update URL in case of redirects
self.total_size = int(response.headers.get("Content-Length", 0))
if self.total_size == 0:
raise ValueError(
"Unable to determine file size. The server might not support range requests."
)
async def download(self) -> None:
self.start_time = time.time()
async with httpx.AsyncClient(follow_redirects=True) as client:
await self.get_file_info(client)
if os.path.exists(self.output_file):
self.downloaded_size = os.path.getsize(self.output_file)
self.start_size = self.downloaded_size
if self.downloaded_size >= self.total_size:
print(f"Already downloaded `{self.output_file}`, skipping...")
return
additional_size = self.total_size - self.downloaded_size
if not self.has_disk_space(additional_size):
M = 1024 * 1024 # noqa
print(
f"Not enough disk space to download `{self.output_file}`. "
f"Required: {(additional_size // M):.2f} MB"
)
raise ValueError(
f"Not enough disk space to download `{self.output_file}`"
)
while True:
if self.downloaded_size >= self.total_size:
break
# Cloudfront has a max-size limit
max_chunk_size = 27_000_000_000
request_size = min(
self.total_size - self.downloaded_size, max_chunk_size
)
headers = {
"Range": f"bytes={self.downloaded_size}-{self.downloaded_size + request_size}"
}
print(f"Downloading `{self.output_file}`....{headers}")
try:
async with client.stream(
"GET", self.url, headers=headers
) as response:
response.raise_for_status()
with open(self.output_file, "ab") as file:
async for chunk in response.aiter_bytes(self.buffer_size):
file.write(chunk)
self.downloaded_size += len(chunk)
self.print_progress()
except httpx.HTTPError as e:
print(f"\nDownload interrupted: {e}")
print("You can resume the download by running the script again.")
except Exception as e:
print(f"\nAn error occurred: {e}")
print(f"\nFinished downloading `{self.output_file}`....")
def print_progress(self) -> None:
percent = (self.downloaded_size / self.total_size) * 100
bar_length = 50
filled_length = int(bar_length * self.downloaded_size // self.total_size)
bar = "" * filled_length + "-" * (bar_length - filled_length)
elapsed_time = time.time() - self.start_time
M = 1024 * 1024 # noqa
speed = (
(self.downloaded_size - self.start_size) / (elapsed_time * M)
if elapsed_time > 0
else 0
)
print(
f"\rProgress: |{bar}| {percent:.2f}% "
f"({self.downloaded_size // M}/{self.total_size // M} MB) "
f"Speed: {speed:.2f} MiB/s",
end="",
flush=True,
)
def has_disk_space(self, file_size: int) -> bool:
dir_path = os.path.dirname(os.path.abspath(self.output_file))
free_space = shutil.disk_usage(dir_path).free
return free_space > file_size
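For reference, a hedged sketch of driving ResumableDownloader directly; the URL and output path are placeholders, and this block is not part of the CLI flow above.

if __name__ == "__main__":
    # Illustrative usage only; real URLs come from the manifest or the signed meta URL.
    downloader = ResumableDownloader(
        url="https://example.com/consolidated.00.pth",
        output_file="/tmp/consolidated.00.pth",
    )
    asyncio.run(downloader.download())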

58
llama_stack/cli/llama.py Normal file
View file

@ -0,0 +1,58 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import argparse
from .download import Download
from .model import ModelParser
from .stack import StackParser
class LlamaCLIParser:
"""Defines CLI parser for Llama CLI"""
def __init__(self):
self.parser = argparse.ArgumentParser(
prog="llama",
description="Welcome to the Llama CLI",
add_help=True,
)
# Default command is to print help
self.parser.set_defaults(func=lambda args: self.parser.print_help())
subparsers = self.parser.add_subparsers(title="subcommands")
# Add sub-commands
Download.create(subparsers)
ModelParser.create(subparsers)
StackParser.create(subparsers)
# Import sub-commands from agentic_system if they exist
try:
from llama_agentic_system.cli.subcommand_modules import SUBCOMMAND_MODULES
for module in SUBCOMMAND_MODULES:
module.create(subparsers)
except ImportError:
pass
def parse_args(self) -> argparse.Namespace:
return self.parser.parse_args()
def run(self, args: argparse.Namespace) -> None:
args.func(args)
def main():
parser = LlamaCLIParser()
args = parser.parse_args()
parser.run(args)
if __name__ == "__main__":
main()

View file

@ -0,0 +1,7 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from .model import ModelParser # noqa

View file

@ -0,0 +1,75 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import argparse
import json
from llama_models.sku_list import resolve_model
from termcolor import colored
from llama_stack.cli.subcommand import Subcommand
from llama_stack.cli.table import print_table
from llama_stack.common.serialize import EnumEncoder
class ModelDescribe(Subcommand):
"""Show details about a model"""
def __init__(self, subparsers: argparse._SubParsersAction):
super().__init__()
self.parser = subparsers.add_parser(
"describe",
prog="llama model describe",
description="Show details about a llama model",
formatter_class=argparse.RawTextHelpFormatter,
)
self._add_arguments()
self.parser.set_defaults(func=self._run_model_describe_cmd)
def _add_arguments(self):
self.parser.add_argument(
"-m",
"--model-id",
type=str,
required=True,
)
def _run_model_describe_cmd(self, args: argparse.Namespace) -> None:
model = resolve_model(args.model_id)
if model is None:
self.parser.error(
f"Model {args.model_id} not found; try 'llama model list' for a list of available models."
)
return
rows = [
(
colored("Model", "white", attrs=["bold"]),
colored(model.descriptor(), "white", attrs=["bold"]),
),
("HuggingFace ID", model.huggingface_repo or "<Not Available>"),
("Description", model.description_markdown),
("Context Length", f"{model.max_seq_length // 1024}K tokens"),
("Weights format", model.quantization_format.value),
("Model params.json", json.dumps(model.model_args, indent=4)),
]
if model.recommended_sampling_params is not None:
sampling_params = model.recommended_sampling_params.dict()
for k in ("max_tokens", "repetition_penalty"):
del sampling_params[k]
rows.append(
(
"Recommended sampling params",
json.dumps(sampling_params, cls=EnumEncoder, indent=4),
)
)
print_table(
rows,
separate_rows=True,
)

View file

@ -0,0 +1,24 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import argparse
from llama_stack.cli.subcommand import Subcommand
class ModelDownload(Subcommand):
def __init__(self, subparsers: argparse._SubParsersAction):
super().__init__()
self.parser = subparsers.add_parser(
"download",
prog="llama model download",
description="Download a model from llama.meta.com or Hugging Face Hub",
formatter_class=argparse.RawTextHelpFormatter,
)
from llama_stack.cli.download import setup_download_parser
setup_download_parser(self.parser)

View file

@ -0,0 +1,60 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import argparse
from llama_models.sku_list import all_registered_models
from llama_stack.cli.subcommand import Subcommand
from llama_stack.cli.table import print_table
class ModelList(Subcommand):
"""List available llama models"""
def __init__(self, subparsers: argparse._SubParsersAction):
super().__init__()
self.parser = subparsers.add_parser(
"list",
prog="llama model list",
description="Show available llama models",
formatter_class=argparse.RawTextHelpFormatter,
)
self._add_arguments()
self.parser.set_defaults(func=self._run_model_list_cmd)
def _add_arguments(self):
self.parser.add_argument(
"--show-all",
action="store_true",
help="Show all models (not just defaults)",
)
def _run_model_list_cmd(self, args: argparse.Namespace) -> None:
headers = [
"Model Descriptor",
"HuggingFace Repo",
"Context Length",
]
rows = []
for model in all_registered_models():
if not args.show_all and not model.is_featured:
continue
descriptor = model.descriptor()
rows.append(
[
descriptor,
model.huggingface_repo,
f"{model.max_seq_length // 1024}K",
]
)
print_table(
rows,
headers,
separate_rows=True,
)

View file

@ -0,0 +1,34 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import argparse
from llama_stack.cli.model.describe import ModelDescribe
from llama_stack.cli.model.download import ModelDownload
from llama_stack.cli.model.list import ModelList
from llama_stack.cli.model.template import ModelTemplate
from llama_stack.cli.subcommand import Subcommand
class ModelParser(Subcommand):
"""Llama cli for model interface apis"""
def __init__(self, subparsers: argparse._SubParsersAction):
super().__init__()
self.parser = subparsers.add_parser(
"model",
prog="llama model",
description="Work with llama models",
)
subparsers = self.parser.add_subparsers(title="model_subcommands")
# Add sub-commands
ModelDownload.create(subparsers)
ModelList.create(subparsers)
ModelTemplate.create(subparsers)
ModelDescribe.create(subparsers)

View file

@ -0,0 +1,113 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import argparse
import textwrap
from termcolor import colored
from llama_stack.cli.subcommand import Subcommand
class ModelTemplate(Subcommand):
"""Llama model cli for describe a model template (message formats)"""
def __init__(self, subparsers: argparse._SubParsersAction):
super().__init__()
self.parser = subparsers.add_parser(
"template",
prog="llama model template",
description="Show llama model message formats",
epilog=textwrap.dedent(
"""
Example:
llama model template <options>
"""
),
formatter_class=argparse.RawTextHelpFormatter,
)
self._add_arguments()
self.parser.set_defaults(func=self._run_model_template_cmd)
def _prompt_type(self, value):
from llama_models.llama3.api.datatypes import ToolPromptFormat
try:
return ToolPromptFormat(value.lower())
except ValueError:
raise argparse.ArgumentTypeError(
f"{value} is not a valid ToolPromptFormat. Choose from {', '.join(t.value for t in ToolPromptFormat)}"
) from None
def _add_arguments(self):
self.parser.add_argument(
"-m",
"--model-family",
type=str,
default="llama3_1",
help="Model Family (llama3_1, llama3_X, etc.)",
)
self.parser.add_argument(
"--name",
type=str,
help="Usecase template name (system_message, user_message, assistant_message, tool_message)...",
required=False,
)
self.parser.add_argument(
"--format",
type=str,
help="ToolPromptFormat (json or function_tag). This flag is used to print the template in a specific formats.",
required=False,
default="json",
)
self.parser.add_argument(
"--raw",
action="store_true",
help="If set to true, don't pretty-print into a table. Useful to copy-paste.",
)
def _run_model_template_cmd(self, args: argparse.Namespace) -> None:
from llama_models.llama3.api.interface import (
list_jinja_templates,
render_jinja_template,
)
from llama_stack.cli.table import print_table
if args.name:
tool_prompt_format = self._prompt_type(args.format)
template, tokens_info = render_jinja_template(args.name, tool_prompt_format)
rendered = ""
for tok, is_special in tokens_info:
if is_special:
rendered += colored(tok, "yellow", attrs=["bold"])
else:
rendered += tok
if not args.raw:
                rendered = rendered.replace("\n", "↵\n")
print_table(
[
(
"Name",
colored(template.template_name, "white", attrs=["bold"]),
),
("Template", rendered),
("Notes", template.notes),
],
separate_rows=True,
)
else:
print("Template: ", template.template_name)
print("=" * 40)
print(rendered)
else:
templates = list_jinja_templates()
headers = ["Role", "Template Name"]
print_table(
[(t.role, t.template_name) for t in templates],
headers,
)

View file

@ -0,0 +1,5 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

View file

@ -0,0 +1,38 @@
#!/bin/bash
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
set -euo pipefail
if [ $# -eq 0 ]; then
echo "Please provide a URL as an argument."
exit 1
fi
URL=$1
HEADERS_FILE=$(mktemp)
curl -s -I "$URL" >"$HEADERS_FILE"
FILENAME=$(grep -i "x-manifold-obj-canonicalpath:" "$HEADERS_FILE" | sed -E 's/.*nodes\/[^\/]+\/(.+)/\1/' | tr -d "\r\n")
if [ -z "$FILENAME" ]; then
echo "Could not find the x-manifold-obj-canonicalpath header."
echo "HEADERS_FILE contents: "
cat "$HEADERS_FILE"
echo ""
exit 1
fi
echo "Downloading $FILENAME..."
curl -s -L -o "$FILENAME" "$URL"
echo "Installing $FILENAME..."
pip install "$FILENAME"
echo "Successfully installed $FILENAME"
rm -f "$FILENAME"

View file

@ -0,0 +1,18 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import os
import subprocess
import sys
def install_wheel_from_presigned():
file = "install-wheel-from-presigned.sh"
script_path = os.path.join(os.path.dirname(__file__), file)
try:
subprocess.run(["sh", script_path] + sys.argv[1:], check=True)
except Exception:
sys.exit(1)

View file

@ -0,0 +1,7 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from .stack import StackParser # noqa

View file

@ -0,0 +1,94 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import argparse
from llama_stack.cli.subcommand import Subcommand
from llama_stack.core.datatypes import * # noqa: F403
from pathlib import Path
import yaml
class StackBuild(Subcommand):
def __init__(self, subparsers: argparse._SubParsersAction):
super().__init__()
self.parser = subparsers.add_parser(
"build",
prog="llama stack build",
description="Build a Llama stack container",
formatter_class=argparse.RawTextHelpFormatter,
)
self._add_arguments()
self.parser.set_defaults(func=self._run_stack_build_command)
def _add_arguments(self):
self.parser.add_argument(
"config",
type=str,
help="Path to a config file to use for the build. You may find example configs in llama_stack/configs/distributions",
)
self.parser.add_argument(
"--name",
type=str,
help="Name of the llama stack build to override from template config",
)
def _run_stack_build_command_from_build_config(
self, build_config: BuildConfig
) -> None:
import json
import os
from llama_stack.common.config_dirs import DISTRIBS_BASE_DIR
from llama_stack.common.serialize import EnumEncoder
from llama_stack.core.package import ApiInput, build_image, ImageType
from termcolor import cprint
# save build.yaml spec for building same distribution again
if build_config.image_type == ImageType.docker.value:
# docker needs build file to be in the llama-stack repo dir to be able to copy over to the image
llama_stack_path = Path(os.path.relpath(__file__)).parent.parent.parent
build_dir = (
llama_stack_path / "configs/distributions" / build_config.image_type
)
else:
build_dir = DISTRIBS_BASE_DIR / build_config.image_type
os.makedirs(build_dir, exist_ok=True)
build_file_path = build_dir / f"{build_config.name}-build.yaml"
with open(build_file_path, "w") as f:
to_write = json.loads(json.dumps(build_config.dict(), cls=EnumEncoder))
f.write(yaml.dump(to_write, sort_keys=False))
build_image(build_config, build_file_path)
cprint(
f"Build spec configuration saved at {str(build_file_path)}",
color="green",
)
def _run_stack_build_command(self, args: argparse.Namespace) -> None:
from llama_stack.common.prompt_for_config import prompt_for_config
from llama_stack.core.dynamic import instantiate_class_type
if not args.config:
self.parser.error(
"No config file specified. Please use `llama stack build /path/to/*-build.yaml`. Example config files can be found in llama_stack/configs/distributions"
)
return
with open(args.config, "r") as f:
try:
build_config = BuildConfig(**yaml.safe_load(f))
except Exception as e:
self.parser.error(f"Could not parse config file {args.config}: {e}")
return
if args.name:
build_config.name = args.name
self._run_stack_build_command_from_build_config(build_config)

View file

@ -0,0 +1,137 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import argparse
import json
from pathlib import Path
import pkg_resources
import yaml
from termcolor import cprint
from llama_stack.cli.subcommand import Subcommand
from llama_stack.common.config_dirs import BUILDS_BASE_DIR
from llama_stack.common.exec import run_with_pty
from llama_stack.core.datatypes import * # noqa: F403
import os
class StackConfigure(Subcommand):
"""Llama cli for configuring llama toolchain configs"""
def __init__(self, subparsers: argparse._SubParsersAction):
super().__init__()
self.parser = subparsers.add_parser(
"configure",
prog="llama stack configure",
description="configure a llama stack distribution",
formatter_class=argparse.RawTextHelpFormatter,
)
self._add_arguments()
self.parser.set_defaults(func=self._run_stack_configure_cmd)
def _add_arguments(self):
self.parser.add_argument(
"config",
type=str,
help="Path to the build config file (e.g. ~/.llama/builds/<image_type>/<name>-build.yaml). For docker, this could also be the name of the docker image. ",
)
self.parser.add_argument(
"--output-dir",
type=str,
help="Path to the output directory to store generated run.yaml config file. If not specified, will use ~/.llama/build/<image_type>/<name>-run.yaml",
)
def _run_stack_configure_cmd(self, args: argparse.Namespace) -> None:
from llama_stack.core.package import ImageType
docker_image = None
build_config_file = Path(args.config)
if not build_config_file.exists():
cprint(
f"Could not find {build_config_file}. Trying docker image name instead...",
color="green",
)
docker_image = args.config
builds_dir = BUILDS_BASE_DIR / ImageType.docker.value
if args.output_dir:
                builds_dir = Path(args.output_dir)
os.makedirs(builds_dir, exist_ok=True)
script = pkg_resources.resource_filename(
"llama_stack", "core/configure_container.sh"
)
script_args = [script, docker_image, str(builds_dir)]
return_code = run_with_pty(script_args)
# we have regenerated the build config file with script, now check if it exists
if return_code != 0:
self.parser.error(
f"Can not find {build_config_file}. Please run llama stack build first or check if docker image exists"
)
build_name = docker_image.removeprefix("llamastack-")
cprint(
f"YAML configuration has been written to {builds_dir / f'{build_name}-run.yaml'}",
color="green",
)
return
with open(build_config_file, "r") as f:
build_config = BuildConfig(**yaml.safe_load(f))
self._configure_llama_distribution(build_config, args.output_dir)
def _configure_llama_distribution(
self,
build_config: BuildConfig,
output_dir: Optional[str] = None,
):
from llama_stack.common.serialize import EnumEncoder
from llama_stack.core.configure import configure_api_providers
builds_dir = BUILDS_BASE_DIR / build_config.image_type
if output_dir:
builds_dir = Path(output_dir)
os.makedirs(builds_dir, exist_ok=True)
image_name = build_config.name.replace("::", "-")
run_config_file = builds_dir / f"{image_name}-run.yaml"
if run_config_file.exists():
cprint(
f"Configuration already exists for {build_config.name}. Will overwrite...",
"yellow",
attrs=["bold"],
)
config = StackRunConfig(**yaml.safe_load(run_config_file.read_text()))
else:
config = StackRunConfig(
built_at=datetime.now(),
image_name=image_name,
apis_to_serve=[],
provider_map={},
)
config = configure_api_providers(config, build_config.distribution_spec)
config.docker_image = (
image_name if build_config.image_type == "docker" else None
)
config.conda_env = image_name if build_config.image_type == "conda" else None
with open(run_config_file, "w") as f:
to_write = json.loads(json.dumps(config.dict(), cls=EnumEncoder))
f.write(yaml.dump(to_write, sort_keys=False))
cprint(
f"> YAML configuration has been written to {run_config_file}",
color="blue",
)

View file

@ -0,0 +1,47 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import argparse
from llama_stack.cli.subcommand import Subcommand
class StackListApis(Subcommand):
def __init__(self, subparsers: argparse._SubParsersAction):
super().__init__()
self.parser = subparsers.add_parser(
"list-apis",
prog="llama stack list-apis",
description="List APIs part of the Llama Stack implementation",
formatter_class=argparse.RawTextHelpFormatter,
)
self._add_arguments()
self.parser.set_defaults(func=self._run_apis_list_cmd)
def _add_arguments(self):
pass
def _run_apis_list_cmd(self, args: argparse.Namespace) -> None:
from llama_stack.cli.table import print_table
from llama_stack.core.distribution import stack_apis
# eventually, this should query a registry at llama.meta.com/llamastack/distributions
headers = [
"API",
]
rows = []
for api in stack_apis():
rows.append(
[
api.value,
]
)
print_table(
rows,
headers,
separate_rows=True,
)

View file

@ -0,0 +1,60 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import argparse
from llama_stack.cli.subcommand import Subcommand
class StackListProviders(Subcommand):
def __init__(self, subparsers: argparse._SubParsersAction):
super().__init__()
self.parser = subparsers.add_parser(
"list-providers",
prog="llama stack list-providers",
description="Show available Llama Stack Providers for an API",
formatter_class=argparse.RawTextHelpFormatter,
)
self._add_arguments()
self.parser.set_defaults(func=self._run_providers_list_cmd)
def _add_arguments(self):
from llama_stack.core.distribution import stack_apis
api_values = [a.value for a in stack_apis()]
self.parser.add_argument(
"api",
type=str,
choices=api_values,
help="API to list providers for (one of: {})".format(api_values),
)
def _run_providers_list_cmd(self, args: argparse.Namespace) -> None:
from llama_stack.cli.table import print_table
from llama_stack.core.distribution import Api, api_providers
all_providers = api_providers()
providers_for_api = all_providers[Api(args.api)]
# eventually, this should query a registry at llama.meta.com/llamastack/distributions
headers = [
"Provider Type",
"PIP Package Dependencies",
]
rows = []
for spec in providers_for_api.values():
rows.append(
[
spec.provider_id,
",".join(spec.pip_packages),
]
)
print_table(
rows,
headers,
separate_rows=True,
)

View file

@ -0,0 +1,88 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import argparse
from pathlib import Path
import pkg_resources
import yaml
from llama_stack.cli.subcommand import Subcommand
from llama_stack.core.datatypes import * # noqa: F403
class StackRun(Subcommand):
def __init__(self, subparsers: argparse._SubParsersAction):
super().__init__()
self.parser = subparsers.add_parser(
"run",
prog="llama stack run",
description="""start the server for a Llama Stack Distribution. You should have already built (or downloaded) and configured the distribution.""",
formatter_class=argparse.RawTextHelpFormatter,
)
self._add_arguments()
self.parser.set_defaults(func=self._run_stack_run_cmd)
def _add_arguments(self):
self.parser.add_argument(
"config",
type=str,
help="Path to config file to use for the run",
)
self.parser.add_argument(
"--port",
type=int,
help="Port to run the server on. Defaults to 5000",
default=5000,
)
self.parser.add_argument(
"--disable-ipv6",
action="store_true",
help="Disable IPv6 support",
default=False,
)
def _run_stack_run_cmd(self, args: argparse.Namespace) -> None:
from llama_stack.common.exec import run_with_pty
if not args.config:
self.parser.error("Must specify a config file to run")
return
path = args.config
config_file = Path(path)
if not config_file.exists():
self.parser.error(
f"File {str(config_file)} does not exist. Did you run `llama stack build`?"
)
return
with open(config_file, "r") as f:
config = StackRunConfig(**yaml.safe_load(f))
if config.docker_image:
script = pkg_resources.resource_filename(
"llama_stack",
"core/start_container.sh",
)
run_args = [script, config.docker_image]
else:
script = pkg_resources.resource_filename(
"llama_stack",
"core/start_conda_env.sh",
)
run_args = [
script,
config.conda_env,
]
run_args.extend([str(config_file), str(args.port)])
if args.disable_ipv6:
run_args.append("--disable-ipv6")
run_with_pty(run_args)

View file

@ -0,0 +1,34 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import argparse
from llama_stack.cli.subcommand import Subcommand
from .build import StackBuild
from .configure import StackConfigure
from .list_apis import StackListApis
from .list_providers import StackListProviders
from .run import StackRun
class StackParser(Subcommand):
def __init__(self, subparsers: argparse._SubParsersAction):
super().__init__()
self.parser = subparsers.add_parser(
"stack",
prog="llama stack",
description="Operations for the Llama Stack / Distributions",
)
subparsers = self.parser.add_subparsers(title="stack_subcommands")
# Add sub-commands
StackBuild.create(subparsers)
StackConfigure.create(subparsers)
StackListApis.create(subparsers)
StackListProviders.create(subparsers)
StackRun.create(subparsers)

View file

@ -0,0 +1,19 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
class Subcommand:
"""All llama cli subcommands must inherit from this class"""
def __init__(self, *args, **kwargs):
pass
@classmethod
def create(cls, *args, **kwargs):
return cls(*args, **kwargs)
def _add_arguments(self):
pass

77
llama_stack/cli/table.py Normal file
View file

@ -0,0 +1,77 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import re
import textwrap
from termcolor import cprint
def strip_ansi_colors(text):
ansi_escape = re.compile(r"\x1B(?:[@-Z\\-_]|\[[0-?]*[ -/]*[@-~])")
return ansi_escape.sub("", text)
def format_row(row, col_widths):
def wrap(text, width):
lines = []
for line in text.split("\n"):
if line.strip() == "":
lines.append("")
else:
lines.extend(
textwrap.wrap(
line, width, break_long_words=False, replace_whitespace=False
)
)
return lines
wrapped = [wrap(item, width) for item, width in zip(row, col_widths)]
max_lines = max(len(subrow) for subrow in wrapped)
lines = []
for i in range(max_lines):
line = []
for cell_lines, width in zip(wrapped, col_widths):
value = cell_lines[i] if i < len(cell_lines) else ""
line.append(value + " " * (width - len(strip_ansi_colors(value))))
lines.append("| " + (" | ".join(line)) + " |")
return "\n".join(lines)
def print_table(rows, headers=None, separate_rows: bool = False):
def itemlen(item):
return max([len(line) for line in strip_ansi_colors(item).split("\n")])
rows = [[x or "" for x in row] for row in rows]
if not headers:
col_widths = [max(itemlen(item) for item in col) for col in zip(*rows)]
else:
col_widths = [
max(
itemlen(header),
max(itemlen(item) for item in col),
)
for header, col in zip(headers, zip(*rows))
]
col_widths = [min(w, 80) for w in col_widths]
header_line = "+".join("-" * (width + 2) for width in col_widths)
header_line = f"+{header_line}+"
if headers:
print(header_line)
cprint(format_row(headers, col_widths), "white", attrs=["bold"])
print(header_line)
for row in rows:
print(format_row(row, col_widths))
if separate_rows:
print(header_line)
if not separate_rows:
print(header_line)

View file

@ -0,0 +1,5 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

View file

@ -0,0 +1,17 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import os
from pathlib import Path
LLAMA_STACK_CONFIG_DIR = Path(os.path.expanduser("~/.llama/"))
DISTRIBS_BASE_DIR = LLAMA_STACK_CONFIG_DIR / "distributions"
DEFAULT_CHECKPOINT_DIR = LLAMA_STACK_CONFIG_DIR / "checkpoints"
BUILDS_BASE_DIR = LLAMA_STACK_CONFIG_DIR / "builds"
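For orientation, these constants expand to paths under the user's home directory; the username below is illustrative.

# e.g. for a user "alice":
#   DISTRIBS_BASE_DIR      -> /home/alice/.llama/distributions
#   DEFAULT_CHECKPOINT_DIR -> /home/alice/.llama/checkpoints
#   BUILDS_BASE_DIR        -> /home/alice/.llama/builds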

View file

@ -0,0 +1,31 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from enum import Enum
from typing import Any, Dict, Optional
from llama_models.llama3.api.datatypes import URL
from llama_models.schema_utils import json_schema_type
from pydantic import BaseModel
@json_schema_type
class RestAPIMethod(Enum):
GET = "GET"
POST = "POST"
PUT = "PUT"
DELETE = "DELETE"
@json_schema_type
class RestAPIExecutionConfig(BaseModel):
url: URL
method: RestAPIMethod
params: Optional[Dict[str, Any]] = None
headers: Optional[Dict[str, Any]] = None
body: Optional[Dict[str, Any]] = None
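A hedged example of a remote-execution config built from these types; the endpoint, header, and body values are placeholders, and URL is assumed to carry a single `uri` field as in the llama_models datatypes.

example_config = RestAPIExecutionConfig(
    url=URL(uri="https://api.example.com/v1/search"),  # placeholder endpoint
    method=RestAPIMethod.POST,
    headers={"Authorization": "Bearer <token>"},  # placeholder credential
    body={"query": "llama stack"},
)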

105
llama_stack/common/exec.py Normal file
View file

@ -0,0 +1,105 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import errno
import os
import pty
import select
import signal
import subprocess
import sys
import termios
from termcolor import cprint
# run a command in a pseudo-terminal, with interrupt handling,
# useful when you want to run interactive things
def run_with_pty(command):
master, slave = pty.openpty()
old_settings = termios.tcgetattr(sys.stdin)
original_sigint = signal.getsignal(signal.SIGINT)
ctrl_c_pressed = False
def sigint_handler(signum, frame):
nonlocal ctrl_c_pressed
ctrl_c_pressed = True
cprint("\nCtrl-C detected. Aborting...", "white", attrs=["bold"])
try:
# Set up the signal handler
signal.signal(signal.SIGINT, sigint_handler)
new_settings = termios.tcgetattr(sys.stdin)
new_settings[3] = new_settings[3] & ~termios.ECHO # Disable echo
new_settings[3] = new_settings[3] & ~termios.ICANON # Disable canonical mode
termios.tcsetattr(sys.stdin, termios.TCSADRAIN, new_settings)
process = subprocess.Popen(
command,
stdin=slave,
stdout=slave,
stderr=slave,
universal_newlines=True,
preexec_fn=os.setsid,
)
# Close the slave file descriptor as it's now owned by the subprocess
os.close(slave)
def handle_io():
while not ctrl_c_pressed:
try:
rlist, _, _ = select.select([sys.stdin, master], [], [], 0.1)
if sys.stdin in rlist:
data = os.read(sys.stdin.fileno(), 1024)
if not data:
break
os.write(master, data)
if master in rlist:
data = os.read(master, 1024)
if not data:
break
sys.stdout.buffer.write(data)
sys.stdout.flush()
except KeyboardInterrupt:
# This will be raised when Ctrl+C is pressed
break
if process.poll() is not None:
break
handle_io()
except (EOFError, KeyboardInterrupt):
pass
except OSError as e:
if e.errno != errno.EIO:
raise
finally:
# Clean up
termios.tcsetattr(sys.stdin, termios.TCSADRAIN, old_settings)
signal.signal(signal.SIGINT, original_sigint)
os.close(master)
if process.poll() is None:
process.terminate()
process.wait()
return process.returncode
def run_command(command):
process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
output, error = process.communicate()
if process.returncode != 0:
print(f"Error: {error.decode('utf-8')}")
sys.exit(1)
return output.decode("utf-8")

View file

@ -0,0 +1,13 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import os
from .config_dirs import DEFAULT_CHECKPOINT_DIR
def model_local_dir(descriptor: str) -> str:
return os.path.join(DEFAULT_CHECKPOINT_DIR, descriptor)
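Minimal illustration of the helper; the descriptor is a placeholder and real values come from llama_models.sku_list.

if __name__ == "__main__":
    print(model_local_dir("Meta-Llama3.1-8B-Instruct"))
    # e.g. /home/<user>/.llama/checkpoints/Meta-Llama3.1-8B-Instruct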

View file

@ -0,0 +1,309 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import inspect
import json
from enum import Enum
from typing import Any, get_args, get_origin, List, Literal, Optional, Type, Union
from pydantic import BaseModel
from pydantic.fields import FieldInfo
from pydantic_core import PydanticUndefinedType
from typing_extensions import Annotated
def is_list_of_primitives(field_type):
"""Check if a field type is a List of primitive types."""
origin = get_origin(field_type)
if origin is List or origin is list:
args = get_args(field_type)
if len(args) == 1 and args[0] in (int, float, str, bool):
return True
return False
def is_basemodel_without_fields(typ):
return (
inspect.isclass(typ) and issubclass(typ, BaseModel) and len(typ.__fields__) == 0
)
def can_recurse(typ):
return (
inspect.isclass(typ) and issubclass(typ, BaseModel) and len(typ.__fields__) > 0
)
def get_literal_values(field):
"""Extract literal values from a field if it's a Literal type."""
if get_origin(field.annotation) is Literal:
return get_args(field.annotation)
return None
def is_optional(field_type):
"""Check if a field type is Optional."""
return get_origin(field_type) is Union and type(None) in get_args(field_type)
def get_non_none_type(field_type):
"""Get the non-None type from an Optional type."""
return next(arg for arg in get_args(field_type) if arg is not type(None))
def manually_validate_field(model: Type[BaseModel], field_name: str, value: Any):
validators = model.__pydantic_decorators__.field_validators
for _name, validator in validators.items():
if field_name in validator.info.fields:
validator.func(value)
return value
def is_discriminated_union(typ) -> bool:
if isinstance(typ, FieldInfo):
return typ.discriminator
else:
if not (get_origin(typ) is Annotated):
return False
args = get_args(typ)
return len(args) >= 2 and args[1].discriminator
def prompt_for_discriminated_union(
field_name,
typ,
existing_value,
):
if isinstance(typ, FieldInfo):
inner_type = typ.annotation
discriminator = typ.discriminator
else:
args = get_args(typ)
inner_type = args[0]
discriminator = args[1].discriminator
union_types = get_args(inner_type)
# Find the discriminator field in each union type
type_map = {}
for t in union_types:
disc_field = t.__fields__[discriminator]
literal_values = get_literal_values(disc_field)
if literal_values:
for value in literal_values:
type_map[value] = t
while True:
discriminator_value = input(
f"Enter `{discriminator}` for {field_name} (options: {', '.join(type_map.keys())}): "
)
if discriminator_value in type_map:
chosen_type = type_map[discriminator_value]
print(f"\nConfiguring {chosen_type.__name__}:")
if existing_value and (
getattr(existing_value, discriminator) != discriminator_value
):
existing_value = None
sub_config = prompt_for_config(chosen_type, existing_value)
# Set the discriminator field in the sub-config
setattr(sub_config, discriminator, discriminator_value)
return sub_config
else:
print(f"Invalid {discriminator}. Please try again.")
# This is somewhat elaborate, but does not purport to be comprehensive in any way.
# We should add handling for the most common cases to tide us over.
#
# doesn't support List[nested_class] yet or Dicts of any kind. needs a bunch of
# unit tests for coverage.
def prompt_for_config(
config_type: type[BaseModel], existing_config: Optional[BaseModel] = None
) -> BaseModel:
"""
Recursively prompt the user for configuration values based on a Pydantic BaseModel.
Args:
config_type: A Pydantic BaseModel class representing the configuration structure.
Returns:
An instance of the config_type with user-provided values.
"""
config_data = {}
for field_name, field in config_type.__fields__.items():
field_type = field.annotation
existing_value = (
getattr(existing_config, field_name) if existing_config else None
)
if existing_value:
default_value = existing_value
else:
default_value = (
field.default
if not isinstance(field.default, PydanticUndefinedType)
else None
)
        is_required = field.is_required()
# Skip fields with Literal type
if get_origin(field_type) is Literal:
continue
        # BaseModels with no fields need no prompting; instantiate them directly
if is_basemodel_without_fields(field_type):
config_data[field_name] = field_type()
continue
if inspect.isclass(field_type) and issubclass(field_type, Enum):
prompt = f"Choose {field_name} (options: {', '.join(e.name for e in field_type)}):"
while True:
# this branch does not handle existing and default values yet
user_input = input(prompt + " ")
try:
value = field_type[user_input]
                    validated_value = manually_validate_field(config_type, field_name, value)
config_data[field_name] = validated_value
break
except KeyError:
print(
f"Invalid choice. Please choose from: {', '.join(e.name for e in field_type)}"
)
continue
if is_discriminated_union(field):
config_data[field_name] = prompt_for_discriminated_union(
field_name, field, existing_value
)
continue
if is_optional(field_type) and can_recurse(get_non_none_type(field_type)):
prompt = f"Do you want to configure {field_name}? (y/n): "
if input(prompt).lower() == "n":
config_data[field_name] = None
continue
nested_type = get_non_none_type(field_type)
print(f"Entering sub-configuration for {field_name}:")
config_data[field_name] = prompt_for_config(nested_type, existing_value)
elif is_optional(field_type) and is_discriminated_union(
get_non_none_type(field_type)
):
prompt = f"Do you want to configure {field_name}? (y/n): "
if input(prompt).lower() == "n":
config_data[field_name] = None
continue
nested_type = get_non_none_type(field_type)
config_data[field_name] = prompt_for_discriminated_union(
field_name,
nested_type,
existing_value,
)
elif can_recurse(field_type):
print(f"\nEntering sub-configuration for {field_name}:")
config_data[field_name] = prompt_for_config(
field_type,
existing_value,
)
else:
prompt = f"Enter value for {field_name}"
if existing_value is not None:
prompt += f" (existing: {existing_value})"
elif default_value is not None:
prompt += f" (default: {default_value})"
if is_optional(field_type):
prompt += " (optional)"
elif is_required:
prompt += " (required)"
prompt += ": "
while True:
user_input = input(prompt)
if user_input == "":
if default_value is not None:
config_data[field_name] = default_value
break
elif is_optional(field_type) or not is_required:
config_data[field_name] = None
break
else:
print("This field is required. Please provide a value.")
continue
else:
try:
# Handle Optional types
if is_optional(field_type):
if user_input.lower() == "none":
value = None
else:
field_type = get_non_none_type(field_type)
value = user_input
# Handle List of primitives
elif is_list_of_primitives(field_type):
try:
value = json.loads(user_input)
if not isinstance(value, list):
raise ValueError(
"Input must be a JSON-encoded list"
)
element_type = get_args(field_type)[0]
value = [element_type(item) for item in value]
except json.JSONDecodeError:
print(
"Invalid JSON. Please enter a valid JSON-encoded list."
)
continue
except ValueError as e:
print(f"{str(e)}")
continue
elif get_origin(field_type) is dict:
try:
value = json.loads(user_input)
if not isinstance(value, dict):
raise ValueError(
"Input must be a JSON-encoded dictionary"
)
except json.JSONDecodeError:
print(
"Invalid JSON. Please enter a valid JSON-encoded dict."
)
continue
# Convert the input to the correct type
elif inspect.isclass(field_type) and issubclass(
field_type, BaseModel
):
# For nested BaseModels, we assume a dictionary-like string input
import ast
value = field_type(**ast.literal_eval(user_input))
else:
value = field_type(user_input)
except ValueError:
print(
f"Invalid input. Expected type: {getattr(field_type, '__name__', str(field_type))}"
)
continue
try:
# Validate the field using our manual validation function
validated_value = manually_validate_field(
config_type, field_name, value
)
config_data[field_name] = validated_value
break
except ValueError as e:
print(f"Validation error: {str(e)}")
return config_type(**config_data)
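A minimal, hypothetical model to exercise prompt_for_config interactively; the field names below are invented purely for illustration.

class ExampleServerConfig(BaseModel):
    host: str = "localhost"
    port: int = 5000
    api_key: Optional[str] = None

# Prompts on stdin for each field and returns a validated instance:
# config = prompt_for_config(ExampleServerConfig)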

View file

@ -0,0 +1,18 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import json
from datetime import datetime
from enum import Enum
class EnumEncoder(json.JSONEncoder):
def default(self, obj):
if isinstance(obj, Enum):
return obj.value
elif isinstance(obj, datetime):
return obj.isoformat()
return super().default(obj)
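Illustrative round-trip with the encoder above: enums serialize to their values and datetimes to ISO-8601 strings.

class _ExampleColor(Enum):
    RED = "red"

# json.dumps({"color": _ExampleColor.RED, "at": datetime(2024, 9, 16)}, cls=EnumEncoder)
# -> '{"color": "red", "at": "2024-09-16T00:00:00"}'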

View file

@ -0,0 +1,16 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from llama_models.llama3.api.datatypes import URL
from llama_models.schema_utils import json_schema_type
from pydantic import BaseModel
@json_schema_type(schema={"description": "Checkpoint created during training runs"})
class Checkpoint(BaseModel):
iters: int
path: URL
epoch: int

View file

@ -0,0 +1,10 @@
name: local-conda-example
distribution_spec:
description: Use code from `llama_stack` itself to serve all llama stack APIs
providers:
inference: meta-reference
memory: meta-reference-faiss
safety: meta-reference
agentic_system: meta-reference
telemetry: console
image_type: conda

View file

@ -0,0 +1,10 @@
name: local-fireworks-conda-example
distribution_spec:
description: Use Fireworks.ai for running LLM inference
providers:
inference: remote::fireworks
memory: meta-reference-faiss
safety: meta-reference
agentic_system: meta-reference
telemetry: console
image_type: conda

View file

@ -0,0 +1,10 @@
name: local-ollama-conda-example
distribution_spec:
description: Like local, but use ollama for running LLM inference
providers:
inference: remote::ollama
memory: meta-reference-faiss
safety: meta-reference
agentic_system: meta-reference
telemetry: console
image_type: conda

View file

@ -0,0 +1,10 @@
name: local-tgi-conda-example
distribution_spec:
  description: Use TGI (local or with Hugging Face Inference Endpoints) for running LLM inference. When using HF Inference Endpoints, you must provide the name of the endpoint.
providers:
inference: remote::tgi
memory: meta-reference-faiss
safety: meta-reference
agentic_system: meta-reference
telemetry: console
image_type: conda

View file

@ -0,0 +1,10 @@
name: local-together-conda-example
distribution_spec:
description: Use Together.ai for running LLM inference
providers:
inference: remote::together
memory: meta-reference-faiss
safety: meta-reference
agentic_system: meta-reference
telemetry: console
image_type: conda

View file

@ -0,0 +1,10 @@
name: local-docker-example
distribution_spec:
description: Use code from `llama_stack` itself to serve all llama stack APIs
providers:
inference: meta-reference
memory: meta-reference-faiss
safety: meta-reference
agentic_system: meta-reference
telemetry: console
image_type: docker

View file

@ -0,0 +1,5 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

View file

@ -0,0 +1,115 @@
#!/bin/bash
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
LLAMA_MODELS_DIR=${LLAMA_MODELS_DIR:-}
LLAMA_TOOLCHAIN_DIR=${LLAMA_TOOLCHAIN_DIR:-}
TEST_PYPI_VERSION=${TEST_PYPI_VERSION:-}
if [ -n "$LLAMA_TOOLCHAIN_DIR" ]; then
echo "Using llama-toolchain-dir=$LLAMA_TOOLCHAIN_DIR"
fi
if [ -n "$LLAMA_MODELS_DIR" ]; then
echo "Using llama-models-dir=$LLAMA_MODELS_DIR"
fi
set -euo pipefail
if [ "$#" -ne 2 ]; then
echo "Usage: $0 <distribution_type> <build_name> <pip_dependencies>" >&2
echo "Example: $0 <distribution_type> mybuild 'numpy pandas scipy'" >&2
exit 1
fi
build_name="$1"
env_name="llamastack-$build_name"
pip_dependencies="$2"
# Define color codes
RED='\033[0;31m'
GREEN='\033[0;32m'
NC='\033[0m' # No Color
# this is set if we actually create a new conda in which case we need to clean up
ENVNAME=""
SCRIPT_DIR=$(dirname "$(readlink -f "$0")")
source "$SCRIPT_DIR/common.sh"
ensure_conda_env_python310() {
local env_name="$1"
local pip_dependencies="$2"
local python_version="3.10"
# Check if conda command is available
if ! command -v conda &>/dev/null; then
printf "${RED}Error: conda command not found. Is Conda installed and in your PATH?${NC}" >&2
exit 1
fi
# Check if the environment exists
if conda env list | grep -q "^${env_name} "; then
printf "Conda environment '${env_name}' exists. Checking Python version...\n"
# Check Python version in the environment
current_version=$(conda run -n "${env_name}" python --version 2>&1 | cut -d' ' -f2 | cut -d'.' -f1,2)
if [ "$current_version" = "$python_version" ]; then
printf "Environment '${env_name}' already has Python ${python_version}. No action needed.\n"
else
printf "Updating environment '${env_name}' to Python ${python_version}...\n"
conda install -n "${env_name}" python="${python_version}" -y
fi
else
printf "Conda environment '${env_name}' does not exist. Creating with Python ${python_version}...\n"
conda create -n "${env_name}" python="${python_version}" -y
ENVNAME="${env_name}"
# setup_cleanup_handlers
fi
eval "$(conda shell.bash hook)"
conda deactivate && conda activate "${env_name}"
if [ -n "$TEST_PYPI_VERSION" ]; then
# these packages are damaged in test-pypi, so install them first
pip install fastapi libcst
pip install --extra-index-url https://test.pypi.org/simple/ llama-models==$TEST_PYPI_VERSION llama-toolchain==$TEST_PYPI_VERSION $pip_dependencies
else
# Re-installing llama-toolchain in the new conda environment
if [ -n "$LLAMA_TOOLCHAIN_DIR" ]; then
if [ ! -d "$LLAMA_TOOLCHAIN_DIR" ]; then
printf "${RED}Warning: LLAMA_TOOLCHAIN_DIR is set but directory does not exist: $LLAMA_TOOLCHAIN_DIR${NC}\n" >&2
exit 1
fi
printf "Installing from LLAMA_TOOLCHAIN_DIR: $LLAMA_TOOLCHAIN_DIR\n"
pip install --no-cache-dir -e "$LLAMA_TOOLCHAIN_DIR"
else
pip install --no-cache-dir llama-toolchain
fi
if [ -n "$LLAMA_MODELS_DIR" ]; then
if [ ! -d "$LLAMA_MODELS_DIR" ]; then
printf "${RED}Warning: LLAMA_MODELS_DIR is set but directory does not exist: $LLAMA_MODELS_DIR${NC}\n" >&2
exit 1
fi
printf "Installing from LLAMA_MODELS_DIR: $LLAMA_MODELS_DIR\n"
pip uninstall -y llama-models
pip install --no-cache-dir -e "$LLAMA_MODELS_DIR"
fi
# Install pip dependencies
if [ -n "$pip_dependencies" ]; then
printf "Installing pip dependencies: $pip_dependencies\n"
pip install $pip_dependencies
fi
fi
}
ensure_conda_env_python310 "$env_name" "$pip_dependencies"

View file

@ -0,0 +1,117 @@
#!/bin/bash
LLAMA_MODELS_DIR=${LLAMA_MODELS_DIR:-}
LLAMA_TOOLCHAIN_DIR=${LLAMA_TOOLCHAIN_DIR:-}
TEST_PYPI_VERSION=${TEST_PYPI_VERSION:-}
if [ "$#" -ne 4 ]; then
echo "Usage: $0 <build_name> <docker_base> <pip_dependencies>
echo "Example: $0 my-fastapi-app python:3.9-slim 'fastapi uvicorn'
exit 1
fi
build_name="$1"
image_name="llamastack-$build_name"
docker_base=$2
build_file_path=$3
pip_dependencies=$4
# Define color codes
RED='\033[0;31m'
GREEN='\033[0;32m'
NC='\033[0m' # No Color
set -euo pipefail
SCRIPT_DIR=$(dirname "$(readlink -f "$0")")
REPO_DIR=$(dirname $(dirname "$SCRIPT_DIR"))
DOCKER_BINARY=${DOCKER_BINARY:-docker}
DOCKER_OPTS=${DOCKER_OPTS:-}
TEMP_DIR=$(mktemp -d)
add_to_docker() {
local input
output_file="$TEMP_DIR/Dockerfile"
if [ -t 0 ]; then
printf '%s\n' "$1" >>"$output_file"
else
# If stdin is not a terminal, read from it (heredoc)
cat >>"$output_file"
fi
}
add_to_docker <<EOF
FROM $docker_base
WORKDIR /app
RUN apt-get update && apt-get install -y \
iputils-ping net-tools iproute2 dnsutils telnet \
curl wget telnet \
procps psmisc lsof \
traceroute \
bubblewrap \
&& rm -rf /var/lib/apt/lists/*
EOF
toolchain_mount="/app/llama-toolchain-source"
models_mount="/app/llama-models-source"
if [ -n "$LLAMA_TOOLCHAIN_DIR" ]; then
if [ ! -d "$LLAMA_TOOLCHAIN_DIR" ]; then
echo "${RED}Warning: LLAMA_TOOLCHAIN_DIR is set but directory does not exist: $LLAMA_TOOLCHAIN_DIR${NC}" >&2
exit 1
fi
add_to_docker "RUN pip install $toolchain_mount"
else
add_to_docker "RUN pip install llama-toolchain"
fi
if [ -n "$LLAMA_MODELS_DIR" ]; then
if [ ! -d "$LLAMA_MODELS_DIR" ]; then
echo "${RED}Warning: LLAMA_MODELS_DIR is set but directory does not exist: $LLAMA_MODELS_DIR${NC}" >&2
exit 1
fi
add_to_docker <<EOF
RUN pip uninstall -y llama-models
RUN pip install $models_mount
EOF
fi
if [ -n "$pip_dependencies" ]; then
add_to_docker "RUN pip install $pip_dependencies"
fi
add_to_docker <<EOF
# This would be good in production but for debugging flexibility lets not add it right now
# We need a more solid production ready entrypoint.sh anyway
#
# ENTRYPOINT ["python", "-m", "llama_stack.core.server"]
EOF
add_to_docker "ADD $build_file_path ./llamastack-build.yaml"
printf "Dockerfile created successfully in $TEMP_DIR/Dockerfile"
cat $TEMP_DIR/Dockerfile
printf "\n"
mounts=""
if [ -n "$LLAMA_TOOLCHAIN_DIR" ]; then
mounts="$mounts -v $(readlink -f $LLAMA_TOOLCHAIN_DIR):$toolchain_mount"
fi
if [ -n "$LLAMA_MODELS_DIR" ]; then
mounts="$mounts -v $(readlink -f $LLAMA_MODELS_DIR):$models_mount"
fi
set -x
$DOCKER_BINARY build $DOCKER_OPTS -t $image_name -f "$TEMP_DIR/Dockerfile" "$REPO_DIR" $mounts
set +x
echo "You can run it with: podman run -p 8000:8000 $image_name"
echo "Checking image builds..."
podman run -it $image_name cat llamastack-build.yaml

40
llama_stack/core/common.sh Executable file
View file

@ -0,0 +1,40 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
cleanup() {
envname="$1"
set +x
echo "Cleaning up..."
conda deactivate
conda env remove --name $envname -y
}
handle_int() {
if [ -n "$ENVNAME" ]; then
cleanup $ENVNAME
fi
exit 1
}
handle_exit() {
if [ $? -ne 0 ]; then
echo -e "\033[1;31mABORTING.\033[0m"
if [ -n "$ENVNAME" ]; then
cleanup $ENVNAME
fi
fi
}
setup_cleanup_handlers() {
trap handle_int INT
trap handle_exit EXIT
__conda_setup="$('conda' 'shell.bash' 'hook' 2>/dev/null)"
eval "$__conda_setup"
conda deactivate
}

View file

@ -0,0 +1,91 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from typing import Any, List
from pydantic import BaseModel
from llama_stack.core.datatypes import * # noqa: F403
from termcolor import cprint
from llama_stack.common.prompt_for_config import prompt_for_config
from llama_stack.core.distribution import api_providers, stack_apis
from llama_stack.core.dynamic import instantiate_class_type
# These are hacks so we can re-use the `prompt_for_config` utility
# This needs a bunch of work to be made very user friendly.
class ReqApis(BaseModel):
apis_to_serve: List[str]
def make_routing_entry_type(config_class: Any):
class BaseModelWithConfig(BaseModel):
routing_key: str
config: config_class
return BaseModelWithConfig
# TODO: make sure we can deal with existing configuration values correctly
# instead of just overwriting them
def configure_api_providers(
config: StackRunConfig, spec: DistributionSpec
) -> StackRunConfig:
cprint("Configuring APIs to serve...", "white", attrs=["bold"])
print("Enter comma-separated list of APIs to serve:")
apis = config.apis_to_serve or list(spec.providers.keys())
apis = [a for a in apis if a != "telemetry"]
req_apis = ReqApis(
apis_to_serve=apis,
)
req_apis = prompt_for_config(ReqApis, req_apis)
config.apis_to_serve = req_apis.apis_to_serve
print("")
apis = [v.value for v in stack_apis()]
all_providers = api_providers()
apis_to_serve = req_apis.apis_to_serve + ["telemetry"]
for api_str in apis_to_serve:
if api_str not in apis:
raise ValueError(f"Unknown API `{api_str}`")
cprint(f"Configuring API `{api_str}`...\n", "white", attrs=["bold"])
api = Api(api_str)
if isinstance(spec.providers[api_str], list):
print(
"You have specified multiple providers for this API. We will configure a routing table now. For each provider, provide a routing key followed by provider configuration.\n"
)
routing_entries = []
for p in spec.providers[api_str]:
print(f"Configuring provider `{p}`...")
provider_spec = all_providers[api][p]
config_type = instantiate_class_type(provider_spec.config_class)
wrapper_type = make_routing_entry_type(config_type)
rt_entry = prompt_for_config(wrapper_type, None)
# TODO: we need to validate the routing keys
routing_entries.append(
ProviderRoutingEntry(
provider_id=p,
routing_key=rt_entry.routing_key,
config=rt_entry.config.dict(),
)
)
config.provider_map[api_str] = routing_entries
else:
provider_spec = all_providers[api][spec.providers[api_str]]
config_type = instantiate_class_type(provider_spec.config_class)
cfg = prompt_for_config(config_type, None)
config.provider_map[api_str] = GenericProviderConfig(
provider_id=spec.providers[api_str],
config=cfg.dict(),
)
return config
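For reference, a minimal sketch (separate from the CLI flow above) of the routing entry this configuration step produces; the provider id, routing key, and config payload are illustrative placeholders only.

from llama_stack.core.datatypes import ProviderRoutingEntry

# Hypothetical values: one routing table entry mapping a model name to a provider.
entry = ProviderRoutingEntry(
    provider_id="remote::ollama",
    routing_key="Meta-Llama3.1-8B-Instruct",
    config={"url": "http://localhost:11434"},
)
print(entry.routing_key, "->", entry.provider_id)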

View file

@ -0,0 +1,31 @@
#!/bin/bash
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
set -euo pipefail
error_handler() {
echo "Error occurred in script at line: ${1}" >&2
exit 1
}
trap 'error_handler ${LINENO}' ERR
if [ $# -lt 2 ]; then
echo "Usage: $0 <container name> <build file path>"
exit 1
fi
docker_image="$1"
host_build_dir="$2"
container_build_dir="/app/builds"
set -x
podman run -it \
-v $host_build_dir:$container_build_dir \
$docker_image \
llama stack configure ./llamastack-build.yaml --output-dir $container_build_dir

View file

@ -0,0 +1,250 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from datetime import datetime
from enum import Enum
from typing import Any, Dict, List, Optional, Union
from llama_models.schema_utils import json_schema_type
from pydantic import BaseModel, Field, validator
@json_schema_type
class Api(Enum):
inference = "inference"
safety = "safety"
agentic_system = "agentic_system"
memory = "memory"
telemetry = "telemetry"
@json_schema_type
class ApiEndpoint(BaseModel):
route: str
method: str
name: str
@json_schema_type
class ProviderSpec(BaseModel):
api: Api
provider_id: str
config_class: str = Field(
...,
description="Fully-qualified classname of the config for this provider",
)
api_dependencies: List[Api] = Field(
default_factory=list,
description="Higher-level API surfaces may depend on other providers to provide their functionality",
)
@json_schema_type
class RouterProviderSpec(ProviderSpec):
provider_id: str = "router"
config_class: str = ""
docker_image: Optional[str] = None
inner_specs: List[ProviderSpec]
module: str = Field(
...,
description="""
Fully-qualified name of the module to import. The module is expected to have:
- `get_router_impl(config, provider_specs, deps)`: returns the router implementation
""",
)
@property
def pip_packages(self) -> List[str]:
raise AssertionError("Should not be called on RouterProviderSpec")
class GenericProviderConfig(BaseModel):
provider_id: str
config: Dict[str, Any]
@json_schema_type
class AdapterSpec(BaseModel):
adapter_id: str = Field(
...,
description="Unique identifier for this adapter",
)
module: str = Field(
...,
description="""
Fully-qualified name of the module to import. The module is expected to have:
- `get_adapter_impl(config, deps)`: returns the adapter implementation
""",
)
pip_packages: List[str] = Field(
default_factory=list,
description="The pip dependencies needed for this implementation",
)
config_class: Optional[str] = Field(
default=None,
description="Fully-qualified classname of the config for this provider",
)
@json_schema_type
class InlineProviderSpec(ProviderSpec):
pip_packages: List[str] = Field(
default_factory=list,
description="The pip dependencies needed for this implementation",
)
docker_image: Optional[str] = Field(
default=None,
description="""
The docker image to use for this implementation. If one is provided, pip_packages will be ignored.
If a provider depends on other providers, the dependencies MUST NOT specify a docker image.
""",
)
module: str = Field(
...,
description="""
Fully-qualified name of the module to import. The module is expected to have:
- `get_provider_impl(config, deps)`: returns the local implementation
""",
)
class RemoteProviderConfig(BaseModel):
url: str = Field(..., description="The URL for the provider")
@validator("url")
@classmethod
def validate_url(cls, url: str) -> str:
if not url.startswith("http"):
raise ValueError(f"URL must start with http: {url}")
return url.rstrip("/")
def remote_provider_id(adapter_id: str) -> str:
return f"remote::{adapter_id}"
@json_schema_type
class RemoteProviderSpec(ProviderSpec):
adapter: Optional[AdapterSpec] = Field(
default=None,
description="""
If some code is needed to convert the remote responses into Llama Stack compatible
API responses, specify the adapter here. If not specified, it indicates the remote
as being "Llama Stack compatible"
""",
)
@property
def docker_image(self) -> Optional[str]:
return None
@property
def module(self) -> str:
if self.adapter:
return self.adapter.module
return f"llama_stack.{self.api.value}.client"
@property
def pip_packages(self) -> List[str]:
if self.adapter:
return self.adapter.pip_packages
return []
# Can avoid this by using Pydantic computed_field
def remote_provider_spec(
api: Api, adapter: Optional[AdapterSpec] = None
) -> RemoteProviderSpec:
config_class = (
adapter.config_class
if adapter and adapter.config_class
else "llama_stack.core.datatypes.RemoteProviderConfig"
)
provider_id = remote_provider_id(adapter.adapter_id) if adapter else "remote"
return RemoteProviderSpec(
api=api, provider_id=provider_id, config_class=config_class, adapter=adapter
)
@json_schema_type
class DistributionSpec(BaseModel):
description: Optional[str] = Field(
default="",
description="Description of the distribution",
)
docker_image: Optional[str] = None
providers: Dict[str, Union[str, List[str]]] = Field(
default_factory=dict,
description="""
Provider Types for each of the APIs provided by this distribution. If you
select multiple providers, you should provide an appropriate 'routing_map'
in the runtime configuration to help route to the correct provider.""",
)
@json_schema_type
class ProviderRoutingEntry(GenericProviderConfig):
routing_key: str
ProviderMapEntry = Union[GenericProviderConfig, List[ProviderRoutingEntry]]
@json_schema_type
class StackRunConfig(BaseModel):
built_at: datetime
image_name: str = Field(
...,
description="""
Reference to the distribution this package refers to. For unregistered (adhoc) packages,
this could be just a hash
""",
)
docker_image: Optional[str] = Field(
default=None,
description="Reference to the docker image if this package refers to a container",
)
conda_env: Optional[str] = Field(
default=None,
description="Reference to the conda environment if this package refers to a conda environment",
)
apis_to_serve: List[str] = Field(
description="""
The list of APIs to serve. If not specified, all APIs specified in the provider_map will be served""",
)
provider_map: Dict[str, ProviderMapEntry] = Field(
description="""
Provider configurations for each of the APIs provided by this package.
Given an API, you can specify a single provider or a "routing table". Each entry in the routing
table has a (routing_key, provider_config) tuple. How the key is interpreted is API-specific.
As examples:
- the "inference" API interprets the routing_key as a "model"
- the "memory" API interprets the routing_key as the type of a "memory bank"
The key may also support wild-cards when routing to the correct provider.""",
)
@json_schema_type
class BuildConfig(BaseModel):
name: str
distribution_spec: DistributionSpec = Field(
description="The distribution spec to build including API providers. "
)
image_type: str = Field(
default="conda",
description="Type of package to build (conda | container)",
)
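A hedged sketch of a run config that exercises both provider_map shapes described above (a single provider and a routing table); every id and value below is a placeholder.

from datetime import datetime

from llama_stack.core.datatypes import (
    GenericProviderConfig,
    ProviderRoutingEntry,
    StackRunConfig,
)

run_config = StackRunConfig(
    built_at=datetime.now(),
    image_name="local-example",
    apis_to_serve=["inference", "safety"],
    provider_map={
        # single provider serving the safety API (provider id is illustrative)
        "safety": GenericProviderConfig(provider_id="meta-reference", config={}),
        # routing table for inference: the routing_key is interpreted as a model name
        "inference": [
            ProviderRoutingEntry(
                provider_id="remote::ollama",
                routing_key="Meta-Llama3.1-8B-Instruct",
                config={"url": "http://localhost:11434"},
            ),
        ],
    },
)
print(run_config.apis_to_serve)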

View file

@ -0,0 +1,76 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import importlib
import inspect
from typing import Dict, List
from llama_stack.agentic_system.api import AgenticSystem
from llama_stack.inference.api import Inference
from llama_stack.memory.api import Memory
from llama_stack.safety.api import Safety
from llama_stack.telemetry.api import Telemetry
from .datatypes import Api, ApiEndpoint, ProviderSpec, remote_provider_spec
# These are the dependencies needed by the distribution server.
# `llama-toolchain` is automatically installed by the installation script.
SERVER_DEPENDENCIES = [
"fastapi",
"uvicorn",
]
def stack_apis() -> List[Api]:
return [v for v in Api]
def api_endpoints() -> Dict[Api, List[ApiEndpoint]]:
apis = {}
protocols = {
Api.inference: Inference,
Api.safety: Safety,
Api.agentic_system: AgenticSystem,
Api.memory: Memory,
Api.telemetry: Telemetry,
}
for api, protocol in protocols.items():
endpoints = []
protocol_methods = inspect.getmembers(protocol, predicate=inspect.isfunction)
for name, method in protocol_methods:
if not hasattr(method, "__webmethod__"):
continue
webmethod = method.__webmethod__
route = webmethod.route
if webmethod.method == "GET":
method = "get"
elif webmethod.method == "DELETE":
method = "delete"
else:
method = "post"
endpoints.append(ApiEndpoint(route=route, method=method, name=name))
apis[api] = endpoints
return apis
def api_providers() -> Dict[Api, Dict[str, ProviderSpec]]:
ret = {}
for api in stack_apis():
name = api.name.lower()
module = importlib.import_module(f"llama_stack.{name}.providers")
ret[api] = {
"remote": remote_provider_spec(api),
**{a.provider_id: a for a in module.available_providers()},
}
return ret
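A small sketch of how the endpoint table built here can be inspected; it only assumes the API modules imported above are available.

from llama_stack.core.distribution import api_endpoints

# Print every route the server would register, grouped by API.
for api, endpoints in api_endpoints().items():
    for ep in endpoints:
        print(f"{api.value:15s} {ep.method.upper():6s} {ep.route}  ({ep.name})")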

View file

@ -0,0 +1,66 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import importlib
from typing import Any, Dict
from llama_stack.core.datatypes import * # noqa: F403
def instantiate_class_type(fully_qualified_name):
module_name, class_name = fully_qualified_name.rsplit(".", 1)
module = importlib.import_module(module_name)
return getattr(module, class_name)
# returns a class implementing the protocol corresponding to the Api
async def instantiate_provider(
provider_spec: ProviderSpec,
deps: Dict[str, Any],
provider_config: ProviderMapEntry,
):
module = importlib.import_module(provider_spec.module)
args = []
if isinstance(provider_spec, RemoteProviderSpec):
if provider_spec.adapter:
method = "get_adapter_impl"
else:
method = "get_client_impl"
assert isinstance(provider_config, GenericProviderConfig)
config_type = instantiate_class_type(provider_spec.config_class)
config = config_type(**provider_config.config)
args = [config, deps]
elif isinstance(provider_spec, RouterProviderSpec):
method = "get_router_impl"
assert isinstance(provider_config, list)
inner_specs = {x.provider_id: x for x in provider_spec.inner_specs}
inner_impls = []
for routing_entry in provider_config:
impl = await instantiate_provider(
inner_specs[routing_entry.provider_id],
deps,
routing_entry,
)
inner_impls.append((routing_entry.routing_key, impl))
config = None
args = [inner_impls, deps]
else:
method = "get_provider_impl"
assert isinstance(provider_config, GenericProviderConfig)
config_type = instantiate_class_type(provider_spec.config_class)
config = config_type(**provider_config.config)
args = [config, deps]
fn = getattr(module, method)
impl = await fn(*args)
impl.__provider_spec__ = provider_spec
impl.__provider_config__ = config
return impl
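instantiate_class_type is a dotted-path import helper; a minimal usage sketch against a config class defined elsewhere in this change:

from llama_stack.core.dynamic import instantiate_class_type

# Resolve the fully-qualified name to the class object, then instantiate it.
config_type = instantiate_class_type("llama_stack.core.datatypes.RemoteProviderConfig")
config = config_type(url="http://localhost:5000")
print(type(config).__name__, config.url)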

View file

@ -0,0 +1,96 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from enum import Enum
from typing import List, Optional
import pkg_resources
from pydantic import BaseModel
from termcolor import cprint
from llama_stack.common.exec import run_with_pty
from llama_stack.core.datatypes import * # noqa: F403
from pathlib import Path
from llama_stack.core.distribution import api_providers, SERVER_DEPENDENCIES
class ImageType(Enum):
docker = "docker"
conda = "conda"
class Dependencies(BaseModel):
pip_packages: List[str]
docker_image: Optional[str] = None
class ApiInput(BaseModel):
api: Api
provider: str
def build_image(build_config: BuildConfig, build_file_path: Path):
package_deps = Dependencies(
docker_image=build_config.distribution_spec.docker_image or "python:3.10-slim",
pip_packages=SERVER_DEPENDENCIES,
)
# extend package dependencies based on providers spec
all_providers = api_providers()
for (
api_str,
provider_or_providers,
) in build_config.distribution_spec.providers.items():
providers_for_api = all_providers[Api(api_str)]
providers = (
provider_or_providers
if isinstance(provider_or_providers, list)
else [provider_or_providers]
)
for provider in providers:
if provider not in providers_for_api:
raise ValueError(
f"Provider `{provider}` is not available for API `{api_str}`"
)
provider_spec = providers_for_api[provider]
package_deps.pip_packages.extend(provider_spec.pip_packages)
if provider_spec.docker_image:
raise ValueError("A stack's dependencies cannot have a docker image")
if build_config.image_type == ImageType.docker.value:
script = pkg_resources.resource_filename(
"llama_stack", "core/build_container.sh"
)
args = [
script,
build_config.name,
package_deps.docker_image,
str(build_file_path),
" ".join(package_deps.pip_packages),
]
else:
script = pkg_resources.resource_filename(
"llama_stack", "core/build_conda_env.sh"
)
args = [
script,
build_config.name,
" ".join(package_deps.pip_packages),
]
return_code = run_with_pty(args)
if return_code != 0:
cprint(
f"Failed to build target {build_config.name} with return code {return_code}",
color="red",
)
return
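A hedged sketch of the BuildConfig that build_image consumes; the provider ids are placeholders and must match providers actually registered for each API.

from llama_stack.core.datatypes import BuildConfig, DistributionSpec

build_config = BuildConfig(
    name="my-local-stack",
    image_type="conda",  # or "docker" to go through build_container.sh
    distribution_spec=DistributionSpec(
        description="example distribution",
        providers={
            "inference": "remote::ollama",  # illustrative provider ids
            "safety": "meta-reference",
        },
    ),
)
# build_image(build_config, build_file_path)  # would shell out to core/build_conda_env.sh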

392
llama_stack/core/server.py Normal file
View file

@ -0,0 +1,392 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import asyncio
import inspect
import json
import signal
import traceback
from collections.abc import (
AsyncGenerator as AsyncGeneratorABC,
AsyncIterator as AsyncIteratorABC,
)
from contextlib import asynccontextmanager
from ssl import SSLError
from typing import (
Any,
AsyncGenerator,
AsyncIterator,
Dict,
get_type_hints,
List,
Optional,
Set,
Tuple,
)
import fire
import httpx
import yaml
from fastapi import Body, FastAPI, HTTPException, Request, Response
from fastapi.exceptions import RequestValidationError
from fastapi.responses import JSONResponse, StreamingResponse
from fastapi.routing import APIRoute
from pydantic import BaseModel, ValidationError
from termcolor import cprint
from typing_extensions import Annotated
from llama_stack.telemetry.tracing import (
end_trace,
setup_logger,
SpanStatus,
start_trace,
)
from llama_stack.core.datatypes import * # noqa: F403
from .distribution import api_endpoints, api_providers
from .dynamic import instantiate_provider
def is_async_iterator_type(typ):
if hasattr(typ, "__origin__"):
origin = typ.__origin__
if isinstance(origin, type):
return issubclass(
origin,
(AsyncIterator, AsyncGenerator, AsyncIteratorABC, AsyncGeneratorABC),
)
return False
return isinstance(
typ, (AsyncIterator, AsyncGenerator, AsyncIteratorABC, AsyncGeneratorABC)
)
def create_sse_event(data: Any) -> str:
if isinstance(data, BaseModel):
data = data.json()
else:
data = json.dumps(data)
return f"data: {data}\n\n"
async def global_exception_handler(request: Request, exc: Exception):
traceback.print_exception(exc)
http_exc = translate_exception(exc)
return JSONResponse(
status_code=http_exc.status_code, content={"error": {"detail": http_exc.detail}}
)
def translate_exception(exc: Exception) -> HTTPException:
if isinstance(exc, ValidationError):
return RequestValidationError(exc.raw_errors)
# Add more custom exception translations here
return HTTPException(status_code=500, detail="Internal server error")
async def passthrough(
request: Request,
downstream_url: str,
downstream_headers: Optional[Dict[str, str]] = None,
):
await start_trace(request.url.path, {"downstream_url": downstream_url})
headers = dict(request.headers)
headers.pop("host", None)
headers.update(downstream_headers or {})
content = await request.body()
client = httpx.AsyncClient()
erred = False
try:
req = client.build_request(
method=request.method,
url=downstream_url,
headers=headers,
content=content,
params=request.query_params,
)
response = await client.send(req, stream=True)
async def stream_response():
async for chunk in response.aiter_raw(chunk_size=64):
yield chunk
await response.aclose()
await client.aclose()
return StreamingResponse(
stream_response(),
status_code=response.status_code,
headers=dict(response.headers),
media_type=response.headers.get("content-type"),
)
except httpx.ReadTimeout:
erred = True
return Response(content="Downstream server timed out", status_code=504)
except httpx.NetworkError as e:
erred = True
return Response(content=f"Network error: {str(e)}", status_code=502)
except httpx.TooManyRedirects:
erred = True
return Response(content="Too many redirects", status_code=502)
except SSLError as e:
erred = True
return Response(content=f"SSL error: {str(e)}", status_code=502)
except httpx.HTTPStatusError as e:
erred = True
return Response(content=str(e), status_code=e.response.status_code)
except Exception as e:
erred = True
return Response(content=f"Unexpected error: {str(e)}", status_code=500)
finally:
await end_trace(SpanStatus.OK if not erred else SpanStatus.ERROR)
def handle_sigint(*args, **kwargs):
print("SIGINT or CTRL-C detected. Exiting gracefully...")
loop = asyncio.get_event_loop()
for task in asyncio.all_tasks(loop):
task.cancel()
loop.stop()
@asynccontextmanager
async def lifespan(app: FastAPI):
print("Starting up")
yield
print("Shutting down")
def create_dynamic_passthrough(
downstream_url: str, downstream_headers: Optional[Dict[str, str]] = None
):
async def endpoint(request: Request):
return await passthrough(request, downstream_url, downstream_headers)
return endpoint
def create_dynamic_typed_route(func: Any, method: str):
hints = get_type_hints(func)
response_model = hints.get("return")
# NOTE: I think it is better to just add a method within each Api
# "Protocol" / adapter-impl to tell what sort of a response this request
# is going to produce. /chat_completion can produce a streaming or
# non-streaming response depending on if request.stream is True / False.
is_streaming = is_async_iterator_type(response_model)
if is_streaming:
async def endpoint(**kwargs):
await start_trace(func.__name__)
async def sse_generator(event_gen):
try:
async for item in event_gen:
yield create_sse_event(item)
await asyncio.sleep(0.01)
except asyncio.CancelledError:
print("Generator cancelled")
await event_gen.aclose()
except Exception as e:
traceback.print_exception(e)
yield create_sse_event(
{
"error": {
"message": str(translate_exception(e)),
},
}
)
finally:
await end_trace()
return StreamingResponse(
sse_generator(func(**kwargs)), media_type="text/event-stream"
)
else:
async def endpoint(**kwargs):
await start_trace(func.__name__)
try:
return (
await func(**kwargs)
if asyncio.iscoroutinefunction(func)
else func(**kwargs)
)
except Exception as e:
traceback.print_exception(e)
raise translate_exception(e) from e
finally:
await end_trace()
sig = inspect.signature(func)
if method == "post":
# make sure every parameter is annotated with Body() so FASTAPI doesn't
# do anything too intelligent and ask for some parameters in the query
# and some in the body
endpoint.__signature__ = sig.replace(
parameters=[
param.replace(
annotation=Annotated[param.annotation, Body(..., embed=True)]
)
for param in sig.parameters.values()
]
)
else:
endpoint.__signature__ = sig
return endpoint
def topological_sort(providers: List[ProviderSpec]) -> List[ProviderSpec]:
by_id = {x.api: x for x in providers}
def dfs(a: ProviderSpec, visited: Set[Api], stack: List[Api]):
visited.add(a.api)
for api in a.api_dependencies:
if api not in visited:
dfs(by_id[api], visited, stack)
stack.append(a.api)
visited = set()
stack = []
for a in providers:
if a.api not in visited:
dfs(a, visited, stack)
return [by_id[x] for x in stack]
def snake_to_camel(snake_str):
return "".join(word.capitalize() for word in snake_str.split("_"))
async def resolve_impls(
provider_map: Dict[str, ProviderMapEntry],
) -> Tuple[Dict[Api, Any], Dict[Api, ProviderSpec]]:
"""
Does two things:
- flatmaps, sorts and resolves the providers in dependency order
- for each API, produces either a (local, passthrough or router) implementation
"""
all_providers = api_providers()
specs = {}
for api_str, item in provider_map.items():
api = Api(api_str)
providers = all_providers[api]
if isinstance(item, GenericProviderConfig):
if item.provider_id not in providers:
raise ValueError(
f"Unknown provider `{provider_id}` is not available for API `{api}`"
)
specs[api] = providers[item.provider_id]
else:
assert isinstance(item, list)
inner_specs = []
for rt_entry in item:
if rt_entry.provider_id not in providers:
raise ValueError(
f"Unknown provider `{rt_entry.provider_id}` is not available for API `{api}`"
)
inner_specs.append(providers[rt_entry.provider_id])
specs[api] = RouterProviderSpec(
api=api,
module=f"llama_stack.{api.value.lower()}.router",
api_dependencies=[],
inner_specs=inner_specs,
)
sorted_specs = topological_sort(specs.values())
impls = {}
for spec in sorted_specs:
api = spec.api
deps = {api: impls[api] for api in spec.api_dependencies}
impl = await instantiate_provider(spec, deps, provider_map[api.value])
impls[api] = impl
return impls, specs
def main(yaml_config: str, port: int = 5000, disable_ipv6: bool = False):
with open(yaml_config, "r") as fp:
config = StackRunConfig(**yaml.safe_load(fp))
app = FastAPI()
impls, specs = asyncio.run(resolve_impls(config.provider_map))
if Api.telemetry in impls:
setup_logger(impls[Api.telemetry])
all_endpoints = api_endpoints()
apis_to_serve = config.apis_to_serve or list(config.provider_map.keys())
for api_str in apis_to_serve:
api = Api(api_str)
endpoints = all_endpoints[api]
impl = impls[api]
provider_spec = specs[api]
if (
isinstance(provider_spec, RemoteProviderSpec)
and provider_spec.adapter is None
):
for endpoint in endpoints:
url = impl.__provider_config__.url.rstrip("/") + endpoint.route
getattr(app, endpoint.method)(endpoint.route)(
create_dynamic_passthrough(url)
)
else:
for endpoint in endpoints:
if not hasattr(impl, endpoint.name):
# ideally this should be a typing violation already
raise ValueError(
f"Could not find method {endpoint.name} on {impl}!!"
)
impl_method = getattr(impl, endpoint.name)
getattr(app, endpoint.method)(endpoint.route, response_model=None)(
create_dynamic_typed_route(impl_method, endpoint.method)
)
for route in app.routes:
if isinstance(route, APIRoute):
cprint(
f"Serving {next(iter(route.methods))} {route.path}",
"white",
attrs=["bold"],
)
app.exception_handler(RequestValidationError)(global_exception_handler)
app.exception_handler(Exception)(global_exception_handler)
signal.signal(signal.SIGINT, handle_sigint)
import uvicorn
# FYI this does not do hot-reloads
listen_host = "::" if not disable_ipv6 else "0.0.0.0"
print(f"Listening on {listen_host}:{port}")
uvicorn.run(app, host=listen_host, port=port)
if __name__ == "__main__":
fire.Fire(main)
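A quick sketch of the SSE framing create_sse_event produces for a pydantic model versus a plain dict; the outputs in the comments are approximate.

from pydantic import BaseModel

from llama_stack.core.server import create_sse_event

class Ping(BaseModel):
    ok: bool = True

print(create_sse_event(Ping()))    # roughly: data: {"ok": true}
print(create_sse_event({"n": 1}))  # roughly: data: {"n": 1}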

View file

@ -0,0 +1,42 @@
#!/bin/bash
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
set -euo pipefail
RED='\033[0;31m'
NC='\033[0m' # No Color
error_handler() {
echo "Error occurred in script at line: ${1}" >&2
exit 1
}
trap 'error_handler ${LINENO}' ERR
if [ $# -lt 3 ]; then
echo "Usage: $0 <build_name> <yaml_config> <port> <script_args...>"
exit 1
fi
build_name="$1"
env_name="llamastack-$build_name"
shift
yaml_config="$1"
shift
port="$1"
shift
eval "$(conda shell.bash hook)"
conda deactivate && conda activate "$env_name"
$CONDA_PREFIX/bin/python \
-m llama_stack.core.server \
--yaml_config "$yaml_config" \
--port "$port" "$@"

View file

@ -0,0 +1,43 @@
#!/bin/bash
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
set -euo pipefail
RED='\033[0;31m'
NC='\033[0m' # No Color
error_handler() {
echo "Error occurred in script at line: ${1}" >&2
exit 1
}
trap 'error_handler ${LINENO}' ERR
if [ $# -lt 3 ]; then
echo "Usage: $0 <build_name> <yaml_config> <port> <other_args...>"
exit 1
fi
build_name="$1"
docker_image="llamastack-$build_name"
shift
yaml_config="$1"
shift
port="$1"
shift
set -x
podman run -it \
-p $port:$port \
-v "$yaml_config:/app/config.yaml" \
$docker_image \
python -m llama_stack.core.server \
--yaml_config /app/config.yaml \
--port $port "$@"

View file

@ -0,0 +1,7 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from .api import * # noqa: F401 F403

View file

@ -0,0 +1,63 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from enum import Enum
from typing import Any, Dict, Optional, Protocol
from llama_models.llama3.api.datatypes import URL
from llama_models.schema_utils import json_schema_type, webmethod
from pydantic import BaseModel
@json_schema_type
class TrainEvalDatasetColumnType(Enum):
dialog = "dialog"
text = "text"
media = "media"
number = "number"
json = "json"
@json_schema_type
class TrainEvalDataset(BaseModel):
"""Dataset to be used for training or evaluating language models."""
# TODO(ashwin): figure out if we need to add an enum for a "dataset type"
columns: Dict[str, TrainEvalDatasetColumnType]
content_url: URL
metadata: Optional[Dict[str, Any]] = None
@json_schema_type
class CreateDatasetRequest(BaseModel):
"""Request to create a dataset."""
uuid: str
dataset: TrainEvalDataset
class Datasets(Protocol):
@webmethod(route="/datasets/create")
def create_dataset(
self,
uuid: str,
dataset: TrainEvalDataset,
) -> None: ...
@webmethod(route="/datasets/get")
def get_dataset(
self,
dataset_uuid: str,
) -> TrainEvalDataset: ...
@webmethod(route="/datasets/delete")
def delete_dataset(
self,
dataset_uuid: str,
) -> None: ...
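A hedged sketch of describing and registering a dataset with these types; the content URL and column names are made up, and URL is assumed here to wrap a single uri field.

from llama_models.llama3.api.datatypes import URL

from llama_stack.dataset.api import (
    CreateDatasetRequest,
    TrainEvalDataset,
    TrainEvalDatasetColumnType,
)

dataset = TrainEvalDataset(
    columns={
        "question": TrainEvalDatasetColumnType.text,
        "answer": TrainEvalDatasetColumnType.text,
    },
    content_url=URL(uri="https://example.com/qa.jsonl"),  # placeholder URL
    metadata={"split": "eval"},
)
request = CreateDatasetRequest(uuid="qa-eval-v0", dataset=dataset)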

View file

@ -0,0 +1,7 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from .api import * # noqa: F401 F403

View file

@ -0,0 +1,122 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from enum import Enum
from typing import List, Protocol
from llama_models.schema_utils import json_schema_type, webmethod
from pydantic import BaseModel
from llama_models.llama3.api.datatypes import * # noqa: F403
from llama_stack.dataset.api import * # noqa: F403
from llama_stack.common.training_types import * # noqa: F403
class TextGenerationMetric(Enum):
perplexity = "perplexity"
rouge = "rouge"
bleu = "bleu"
class QuestionAnsweringMetric(Enum):
em = "em"
f1 = "f1"
class SummarizationMetric(Enum):
rouge = "rouge"
bleu = "bleu"
class EvaluationJob(BaseModel):
job_uuid: str
class EvaluationJobLogStream(BaseModel):
job_uuid: str
class EvaluateTaskRequestCommon(BaseModel):
job_uuid: str
dataset: TrainEvalDataset
checkpoint: Checkpoint
# generation params
sampling_params: SamplingParams = SamplingParams()
@json_schema_type
class EvaluateTextGenerationRequest(EvaluateTaskRequestCommon):
"""Request to evaluate text generation."""
metrics: List[TextGenerationMetric]
@json_schema_type
class EvaluateQuestionAnsweringRequest(EvaluateTaskRequestCommon):
"""Request to evaluate question answering."""
metrics: List[QuestionAnsweringMetric]
@json_schema_type
class EvaluateSummarizationRequest(EvaluateTaskRequestCommon):
"""Request to evaluate summarization."""
metrics: List[SummarizationMetric]
class EvaluationJobStatusResponse(BaseModel):
job_uuid: str
@json_schema_type
class EvaluationJobArtifactsResponse(BaseModel):
"""Artifacts of a evaluation job."""
job_uuid: str
class Evaluations(Protocol):
@webmethod(route="/evaluate/text_generation/")
def evaluate_text_generation(
self,
metrics: List[TextGenerationMetric],
) -> EvaluationJob: ...
@webmethod(route="/evaluate/question_answering/")
def evaluate_question_answering(
self,
metrics: List[QuestionAnsweringMetric],
) -> EvaluationJob: ...
@webmethod(route="/evaluate/summarization/")
def evaluate_summarization(
self,
metrics: List[SummarizationMetric],
) -> EvaluationJob: ...
@webmethod(route="/evaluate/jobs")
def get_evaluation_jobs(self) -> List[EvaluationJob]: ...
@webmethod(route="/evaluate/job/status")
def get_evaluation_job_status(
self, job_uuid: str
) -> EvaluationJobStatusResponse: ...
# sends SSE stream of logs
@webmethod(route="/evaluate/job/logs")
def get_evaluation_job_logstream(self, job_uuid: str) -> EvaluationJobLogStream: ...
@webmethod(route="/evaluate/job/cancel")
def cancel_evaluation_job(self, job_uuid: str) -> None: ...
@webmethod(route="/evaluate/job/artifacts")
def get_evaluation_job_artifacts(
self, job_uuid: str
) -> EvaluationJobArtifactsResponse: ...

View file

@ -0,0 +1,5 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

View file

@ -0,0 +1,5 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

View file

@ -0,0 +1,18 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from .config import FireworksImplConfig
async def get_adapter_impl(config: FireworksImplConfig, _deps):
from .fireworks import FireworksInferenceAdapter
assert isinstance(
config, FireworksImplConfig
), f"Unexpected config type: {type(config)}"
impl = FireworksInferenceAdapter(config)
await impl.initialize()
return impl

View file

@ -0,0 +1,20 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from llama_models.schema_utils import json_schema_type
from pydantic import BaseModel, Field
@json_schema_type
class FireworksImplConfig(BaseModel):
url: str = Field(
default="https://api.fireworks.ai/inference",
description="The URL for the Fireworks server",
)
api_key: str = Field(
default="",
description="The Fireworks.ai API Key",
)

View file

@ -0,0 +1,244 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from typing import AsyncGenerator
from fireworks.client import Fireworks
from llama_models.llama3.api.chat_format import ChatFormat
from llama_models.llama3.api.datatypes import Message, StopReason
from llama_models.llama3.api.tokenizer import Tokenizer
from llama_models.sku_list import resolve_model
from llama_stack.inference.api import * # noqa: F403
from llama_stack.inference.prepare_messages import prepare_messages
from .config import FireworksImplConfig
FIREWORKS_SUPPORTED_MODELS = {
"Meta-Llama3.1-8B-Instruct": "fireworks/llama-v3p1-8b-instruct",
"Meta-Llama3.1-70B-Instruct": "fireworks/llama-v3p1-70b-instruct",
"Meta-Llama3.1-405B-Instruct": "fireworks/llama-v3p1-405b-instruct",
}
class FireworksInferenceAdapter(Inference):
def __init__(self, config: FireworksImplConfig) -> None:
self.config = config
tokenizer = Tokenizer.get_instance()
self.formatter = ChatFormat(tokenizer)
@property
def client(self) -> Fireworks:
return Fireworks(api_key=self.config.api_key)
async def initialize(self) -> None:
return
async def shutdown(self) -> None:
pass
async def completion(self, request: CompletionRequest) -> AsyncGenerator:
raise NotImplementedError()
def _messages_to_fireworks_messages(self, messages: list[Message]) -> list:
fireworks_messages = []
for message in messages:
if message.role == "ipython":
role = "tool"
else:
role = message.role
fireworks_messages.append({"role": role, "content": message.content})
return fireworks_messages
def resolve_fireworks_model(self, model_name: str) -> str:
model = resolve_model(model_name)
assert (
model is not None
and model.descriptor(shorten_default_variant=True)
in FIREWORKS_SUPPORTED_MODELS
), f"Unsupported model: {model_name}, use one of the supported models: {','.join(FIREWORKS_SUPPORTED_MODELS.keys())}"
return FIREWORKS_SUPPORTED_MODELS.get(
model.descriptor(shorten_default_variant=True)
)
def get_fireworks_chat_options(self, request: ChatCompletionRequest) -> dict:
options = {}
if request.sampling_params is not None:
for attr in {"temperature", "top_p", "top_k", "max_tokens"}:
if getattr(request.sampling_params, attr):
options[attr] = getattr(request.sampling_params, attr)
return options
async def chat_completion(
self,
model: str,
messages: List[Message],
sampling_params: Optional[SamplingParams] = SamplingParams(),
tools: Optional[List[ToolDefinition]] = list(),
tool_choice: Optional[ToolChoice] = ToolChoice.auto,
tool_prompt_format: Optional[ToolPromptFormat] = ToolPromptFormat.json,
stream: Optional[bool] = False,
logprobs: Optional[LogProbConfig] = None,
) -> AsyncGenerator:
request = ChatCompletionRequest(
model=model,
messages=messages,
sampling_params=sampling_params,
tools=tools,
tool_choice=tool_choice,
tool_prompt_format=tool_prompt_format,
stream=stream,
logprobs=logprobs,
)
messages = prepare_messages(request)
# accumulate sampling params and other options to pass to fireworks
options = self.get_fireworks_chat_options(request)
fireworks_model = self.resolve_fireworks_model(request.model)
if not request.stream:
r = await self.client.chat.completions.acreate(
model=fireworks_model,
messages=self._messages_to_fireworks_messages(messages),
stream=False,
**options,
)
stop_reason = None
if r.choices[0].finish_reason:
if r.choices[0].finish_reason == "stop":
stop_reason = StopReason.end_of_turn
elif r.choices[0].finish_reason == "length":
stop_reason = StopReason.out_of_tokens
completion_message = self.formatter.decode_assistant_message_from_content(
r.choices[0].message.content, stop_reason
)
yield ChatCompletionResponse(
completion_message=completion_message,
logprobs=None,
)
else:
yield ChatCompletionResponseStreamChunk(
event=ChatCompletionResponseEvent(
event_type=ChatCompletionResponseEventType.start,
delta="",
)
)
buffer = ""
ipython = False
stop_reason = None
async for chunk in self.client.chat.completions.acreate(
model=fireworks_model,
messages=self._messages_to_fireworks_messages(messages),
stream=True,
**options,
):
if chunk.choices[0].finish_reason:
if stop_reason is None and chunk.choices[0].finish_reason == "stop":
stop_reason = StopReason.end_of_turn
elif (
stop_reason is None
and chunk.choices[0].finish_reason == "length"
):
stop_reason = StopReason.out_of_tokens
break
text = chunk.choices[0].delta.content
if text is None:
continue
# check if its a tool call ( aka starts with <|python_tag|> )
if not ipython and text.startswith("<|python_tag|>"):
ipython = True
yield ChatCompletionResponseStreamChunk(
event=ChatCompletionResponseEvent(
event_type=ChatCompletionResponseEventType.progress,
delta=ToolCallDelta(
content="",
parse_status=ToolCallParseStatus.started,
),
)
)
buffer += text
continue
if ipython:
if text == "<|eot_id|>":
stop_reason = StopReason.end_of_turn
text = ""
continue
elif text == "<|eom_id|>":
stop_reason = StopReason.end_of_message
text = ""
continue
buffer += text
delta = ToolCallDelta(
content=text,
parse_status=ToolCallParseStatus.in_progress,
)
yield ChatCompletionResponseStreamChunk(
event=ChatCompletionResponseEvent(
event_type=ChatCompletionResponseEventType.progress,
delta=delta,
stop_reason=stop_reason,
)
)
else:
buffer += text
yield ChatCompletionResponseStreamChunk(
event=ChatCompletionResponseEvent(
event_type=ChatCompletionResponseEventType.progress,
delta=text,
stop_reason=stop_reason,
)
)
# parse tool calls and report errors
message = self.formatter.decode_assistant_message_from_content(
buffer, stop_reason
)
parsed_tool_calls = len(message.tool_calls) > 0
if ipython and not parsed_tool_calls:
yield ChatCompletionResponseStreamChunk(
event=ChatCompletionResponseEvent(
event_type=ChatCompletionResponseEventType.progress,
delta=ToolCallDelta(
content="",
parse_status=ToolCallParseStatus.failure,
),
stop_reason=stop_reason,
)
)
for tool_call in message.tool_calls:
yield ChatCompletionResponseStreamChunk(
event=ChatCompletionResponseEvent(
event_type=ChatCompletionResponseEventType.progress,
delta=ToolCallDelta(
content=tool_call,
parse_status=ToolCallParseStatus.success,
),
stop_reason=stop_reason,
)
)
yield ChatCompletionResponseStreamChunk(
event=ChatCompletionResponseEvent(
event_type=ChatCompletionResponseEventType.complete,
delta="",
stop_reason=stop_reason,
)
)
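A hedged sketch of consuming the streaming generator; the adapter import paths are assumptions (this diff does not show the package layout) and running it requires a real Fireworks API key.

from llama_models.llama3.api.datatypes import UserMessage

# NOTE: the import paths below are assumed, not confirmed by this diff.
from llama_stack.inference.adapters.fireworks import get_adapter_impl
from llama_stack.inference.adapters.fireworks.config import FireworksImplConfig

async def demo():
    impl = await get_adapter_impl(FireworksImplConfig(api_key="YOUR_KEY"), {})
    async for chunk in impl.chat_completion(
        model="Meta-Llama3.1-8B-Instruct",
        messages=[UserMessage(content="Say hello")],
        stream=True,
    ):
        print(chunk)

# run with: asyncio.run(demo())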

View file

@ -0,0 +1,15 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from llama_stack.core.datatypes import RemoteProviderConfig
async def get_adapter_impl(config: RemoteProviderConfig, _deps):
from .ollama import OllamaInferenceAdapter
impl = OllamaInferenceAdapter(config.url)
await impl.initialize()
return impl

View file

@ -0,0 +1,260 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from typing import AsyncGenerator
import httpx
from llama_models.llama3.api.chat_format import ChatFormat
from llama_models.llama3.api.datatypes import Message, StopReason
from llama_models.llama3.api.tokenizer import Tokenizer
from llama_models.sku_list import resolve_model
from ollama import AsyncClient
from llama_stack.inference.api import * # noqa: F403
from llama_stack.inference.prepare_messages import prepare_messages
# TODO: Eventually this will move to the llama cli model list command
# mapping of Model SKUs to ollama models
OLLAMA_SUPPORTED_SKUS = {
# "Meta-Llama3.1-8B-Instruct": "llama3.1",
"Meta-Llama3.1-8B-Instruct": "llama3.1:8b-instruct-fp16",
"Meta-Llama3.1-70B-Instruct": "llama3.1:70b-instruct-fp16",
}
class OllamaInferenceAdapter(Inference):
def __init__(self, url: str) -> None:
self.url = url
tokenizer = Tokenizer.get_instance()
self.formatter = ChatFormat(tokenizer)
@property
def client(self) -> AsyncClient:
return AsyncClient(host=self.url)
async def initialize(self) -> None:
try:
await self.client.ps()
except httpx.ConnectError as e:
raise RuntimeError(
"Ollama Server is not running, start it using `ollama serve` in a separate terminal"
) from e
async def shutdown(self) -> None:
pass
async def completion(self, request: CompletionRequest) -> AsyncGenerator:
raise NotImplementedError()
def _messages_to_ollama_messages(self, messages: list[Message]) -> list:
ollama_messages = []
for message in messages:
if message.role == "ipython":
role = "tool"
else:
role = message.role
ollama_messages.append({"role": role, "content": message.content})
return ollama_messages
def resolve_ollama_model(self, model_name: str) -> str:
model = resolve_model(model_name)
assert (
model is not None
and model.descriptor(shorten_default_variant=True) in OLLAMA_SUPPORTED_SKUS
), f"Unsupported model: {model_name}, use one of the supported models: {','.join(OLLAMA_SUPPORTED_SKUS.keys())}"
return OLLAMA_SUPPORTED_SKUS.get(model.descriptor(shorten_default_variant=True))
def get_ollama_chat_options(self, request: ChatCompletionRequest) -> dict:
options = {}
if request.sampling_params is not None:
for attr in {"temperature", "top_p", "top_k", "max_tokens"}:
if getattr(request.sampling_params, attr):
options[attr] = getattr(request.sampling_params, attr)
if (
request.sampling_params.repetition_penalty is not None
and request.sampling_params.repetition_penalty != 1.0
):
options["repeat_penalty"] = request.sampling_params.repetition_penalty
return options
async def chat_completion(
self,
model: str,
messages: List[Message],
sampling_params: Optional[SamplingParams] = SamplingParams(),
tools: Optional[List[ToolDefinition]] = list(),
tool_choice: Optional[ToolChoice] = ToolChoice.auto,
tool_prompt_format: Optional[ToolPromptFormat] = ToolPromptFormat.json,
stream: Optional[bool] = False,
logprobs: Optional[LogProbConfig] = None,
) -> AsyncGenerator:
request = ChatCompletionRequest(
model=model,
messages=messages,
sampling_params=sampling_params,
tools=tools,
tool_choice=tool_choice,
tool_prompt_format=tool_prompt_format,
stream=stream,
logprobs=logprobs,
)
messages = prepare_messages(request)
# accumulate sampling params and other options to pass to ollama
options = self.get_ollama_chat_options(request)
ollama_model = self.resolve_ollama_model(request.model)
res = await self.client.ps()
need_model_pull = True
for r in res["models"]:
if ollama_model == r["model"]:
need_model_pull = False
break
if need_model_pull:
print(f"Pulling model: {ollama_model}")
status = await self.client.pull(ollama_model)
assert (
status["status"] == "success"
), f"Failed to pull model {self.model} in ollama"
if not request.stream:
r = await self.client.chat(
model=ollama_model,
messages=self._messages_to_ollama_messages(messages),
stream=False,
options=options,
)
stop_reason = None
if r["done"]:
if r["done_reason"] == "stop":
stop_reason = StopReason.end_of_turn
elif r["done_reason"] == "length":
stop_reason = StopReason.out_of_tokens
completion_message = self.formatter.decode_assistant_message_from_content(
r["message"]["content"], stop_reason
)
yield ChatCompletionResponse(
completion_message=completion_message,
logprobs=None,
)
else:
yield ChatCompletionResponseStreamChunk(
event=ChatCompletionResponseEvent(
event_type=ChatCompletionResponseEventType.start,
delta="",
)
)
stream = await self.client.chat(
model=ollama_model,
messages=self._messages_to_ollama_messages(messages),
stream=True,
options=options,
)
buffer = ""
ipython = False
stop_reason = None
async for chunk in stream:
if chunk["done"]:
if stop_reason is None and chunk["done_reason"] == "stop":
stop_reason = StopReason.end_of_turn
elif stop_reason is None and chunk["done_reason"] == "length":
stop_reason = StopReason.out_of_tokens
break
text = chunk["message"]["content"]
# check if its a tool call ( aka starts with <|python_tag|> )
if not ipython and text.startswith("<|python_tag|>"):
ipython = True
yield ChatCompletionResponseStreamChunk(
event=ChatCompletionResponseEvent(
event_type=ChatCompletionResponseEventType.progress,
delta=ToolCallDelta(
content="",
parse_status=ToolCallParseStatus.started,
),
)
)
buffer += text
continue
if ipython:
if text == "<|eot_id|>":
stop_reason = StopReason.end_of_turn
text = ""
continue
elif text == "<|eom_id|>":
stop_reason = StopReason.end_of_message
text = ""
continue
buffer += text
delta = ToolCallDelta(
content=text,
parse_status=ToolCallParseStatus.in_progress,
)
yield ChatCompletionResponseStreamChunk(
event=ChatCompletionResponseEvent(
event_type=ChatCompletionResponseEventType.progress,
delta=delta,
stop_reason=stop_reason,
)
)
else:
buffer += text
yield ChatCompletionResponseStreamChunk(
event=ChatCompletionResponseEvent(
event_type=ChatCompletionResponseEventType.progress,
delta=text,
stop_reason=stop_reason,
)
)
# parse tool calls and report errors
message = self.formatter.decode_assistant_message_from_content(
buffer, stop_reason
)
parsed_tool_calls = len(message.tool_calls) > 0
if ipython and not parsed_tool_calls:
yield ChatCompletionResponseStreamChunk(
event=ChatCompletionResponseEvent(
event_type=ChatCompletionResponseEventType.progress,
delta=ToolCallDelta(
content="",
parse_status=ToolCallParseStatus.failure,
),
stop_reason=stop_reason,
)
)
for tool_call in message.tool_calls:
yield ChatCompletionResponseStreamChunk(
event=ChatCompletionResponseEvent(
event_type=ChatCompletionResponseEventType.progress,
delta=ToolCallDelta(
content=tool_call,
parse_status=ToolCallParseStatus.success,
),
stop_reason=stop_reason,
)
)
yield ChatCompletionResponseStreamChunk(
event=ChatCompletionResponseEvent(
event_type=ChatCompletionResponseEventType.complete,
delta="",
stop_reason=stop_reason,
)
)
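A minimal sketch of the SKU-to-Ollama-tag lookup the adapter performs, reusing the mapping defined above.

from llama_models.sku_list import resolve_model

OLLAMA_SUPPORTED_SKUS = {
    "Meta-Llama3.1-8B-Instruct": "llama3.1:8b-instruct-fp16",
    "Meta-Llama3.1-70B-Instruct": "llama3.1:70b-instruct-fp16",
}

model = resolve_model("Meta-Llama3.1-8B-Instruct")
print(OLLAMA_SUPPORTED_SKUS[model.descriptor(shorten_default_variant=True)])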

View file

@ -0,0 +1,24 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from .config import TGIImplConfig
from .tgi import InferenceEndpointAdapter, TGIAdapter
async def get_adapter_impl(config: TGIImplConfig, _deps):
assert isinstance(config, TGIImplConfig), f"Unexpected config type: {type(config)}"
if config.url is not None:
impl = TGIAdapter(config)
elif config.is_inference_endpoint():
impl = InferenceEndpointAdapter(config)
else:
raise ValueError(
"Invalid configuration. Specify either an URL or HF Inference Endpoint details (namespace and endpoint name)."
)
await impl.initialize()
return impl

View file

@ -0,0 +1,29 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from typing import Optional
from llama_models.schema_utils import json_schema_type
from pydantic import BaseModel, Field
@json_schema_type
class TGIImplConfig(BaseModel):
url: Optional[str] = Field(
default=None,
description="The URL for the local TGI endpoint (e.g., http://localhost:8080)",
)
api_token: Optional[str] = Field(
default=None,
description="The HF token for Hugging Face Inference Endpoints (will default to locally saved token if not provided)",
)
hf_endpoint_name: Optional[str] = Field(
default=None,
description="The name of the Hugging Face Inference Endpoint : can be either in the format of '{namespace}/{endpoint_name}' (namespace can be the username or organization name) or just '{endpoint_name}' if logged into the same account as the namespace",
)
def is_inference_endpoint(self) -> bool:
return self.hf_endpoint_name is not None
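A sketch of the two configuration shapes the TGI adapter distinguishes; the import path is assumed and the endpoint name and token are placeholders.

from llama_stack.inference.adapters.tgi.config import TGIImplConfig  # path assumed

local = TGIImplConfig(url="http://localhost:8080")
endpoint = TGIImplConfig(hf_endpoint_name="my-org/my-endpoint", api_token="hf_xxx")
print(local.is_inference_endpoint(), endpoint.is_inference_endpoint())  # False True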

View file

@ -0,0 +1,295 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from typing import Any, AsyncGenerator, Dict
import requests
from huggingface_hub import HfApi, InferenceClient
from llama_models.llama3.api.chat_format import ChatFormat
from llama_models.llama3.api.datatypes import StopReason
from llama_models.llama3.api.tokenizer import Tokenizer
from llama_stack.inference.api import * # noqa: F403
from llama_stack.inference.prepare_messages import prepare_messages
from .config import TGIImplConfig
HF_SUPPORTED_MODELS = {
"Meta-Llama3.1-8B-Instruct": "meta-llama/Meta-Llama-3.1-8B-Instruct",
"Meta-Llama3.1-70B-Instruct": "meta-llama/Meta-Llama-3.1-70B-Instruct",
"Meta-Llama3.1-405B-Instruct": "meta-llama/Meta-Llama-3.1-405B-Instruct",
}
class TGIAdapter(Inference):
def __init__(self, config: TGIImplConfig) -> None:
self.config = config
self.tokenizer = Tokenizer.get_instance()
self.formatter = ChatFormat(self.tokenizer)
@property
def client(self) -> InferenceClient:
return InferenceClient(model=self.config.url, token=self.config.api_token)
def _get_endpoint_info(self) -> Dict[str, Any]:
return {
**self.client.get_endpoint_info(),
"inference_url": self.config.url,
}
async def initialize(self) -> None:
try:
info = self._get_endpoint_info()
if "model_id" not in info:
raise RuntimeError("Missing model_id in model info")
if "max_total_tokens" not in info:
raise RuntimeError("Missing max_total_tokens in model info")
self.max_tokens = info["max_total_tokens"]
model_id = info["model_id"]
model_name = next(
(name for name, id in HF_SUPPORTED_MODELS.items() if id == model_id),
None,
)
if model_name is None:
raise RuntimeError(
f"TGI is serving model: {model_id}, use one of the supported models: {', '.join(HF_SUPPORTED_MODELS.values())}"
)
self.model_name = model_name
self.inference_url = info["inference_url"]
except Exception as e:
import traceback
traceback.print_exc()
raise RuntimeError(f"Error initializing TGIAdapter: {e}") from e
async def shutdown(self) -> None:
pass
async def completion(self, request: CompletionRequest) -> AsyncGenerator:
raise NotImplementedError()
def get_chat_options(self, request: ChatCompletionRequest) -> dict:
options = {}
if request.sampling_params is not None:
for attr in {"temperature", "top_p", "top_k", "max_tokens"}:
if getattr(request.sampling_params, attr):
options[attr] = getattr(request.sampling_params, attr)
return options
async def chat_completion(
self,
model: str,
messages: List[Message],
sampling_params: Optional[SamplingParams] = SamplingParams(),
tools: Optional[List[ToolDefinition]] = list(),
tool_choice: Optional[ToolChoice] = ToolChoice.auto,
tool_prompt_format: Optional[ToolPromptFormat] = ToolPromptFormat.json,
stream: Optional[bool] = False,
logprobs: Optional[LogProbConfig] = None,
) -> AsyncGenerator:
request = ChatCompletionRequest(
model=model,
messages=messages,
sampling_params=sampling_params,
tools=tools,
tool_choice=tool_choice,
tool_prompt_format=tool_prompt_format,
stream=stream,
logprobs=logprobs,
)
messages = prepare_messages(request)
model_input = self.formatter.encode_dialog_prompt(messages)
prompt = self.tokenizer.decode(model_input.tokens)
input_tokens = len(model_input.tokens)
max_new_tokens = min(
request.sampling_params.max_tokens or (self.max_tokens - input_tokens),
self.max_tokens - input_tokens - 1,
)
print(f"Calculated max_new_tokens: {max_new_tokens}")
assert (
request.model == self.model_name
), f"Model mismatch, expected {self.model_name}, got {request.model}"
options = self.get_chat_options(request)
if not request.stream:
response = self.client.text_generation(
prompt=prompt,
stream=False,
details=True,
max_new_tokens=max_new_tokens,
stop_sequences=["<|eom_id|>", "<|eot_id|>"],
**options,
)
stop_reason = None
if response.details.finish_reason:
if response.details.finish_reason == "stop":
stop_reason = StopReason.end_of_turn
elif response.details.finish_reason == "length":
stop_reason = StopReason.out_of_tokens
completion_message = self.formatter.decode_assistant_message_from_content(
response.generated_text,
stop_reason,
)
yield ChatCompletionResponse(
completion_message=completion_message,
logprobs=None,
)
else:
yield ChatCompletionResponseStreamChunk(
event=ChatCompletionResponseEvent(
event_type=ChatCompletionResponseEventType.start,
delta="",
)
)
buffer = ""
ipython = False
stop_reason = None
tokens = []
for response in self.client.text_generation(
prompt=prompt,
stream=True,
details=True,
max_new_tokens=max_new_tokens,
stop_sequences=["<|eom_id|>", "<|eot_id|>"],
**options,
):
token_result = response.token
buffer += token_result.text
tokens.append(token_result.id)
if not ipython and buffer.startswith("<|python_tag|>"):
ipython = True
yield ChatCompletionResponseStreamChunk(
event=ChatCompletionResponseEvent(
event_type=ChatCompletionResponseEventType.progress,
delta=ToolCallDelta(
content="",
parse_status=ToolCallParseStatus.started,
),
)
)
buffer = buffer[len("<|python_tag|>") :]
continue
if token_result.text == "<|eot_id|>":
stop_reason = StopReason.end_of_turn
text = ""
elif token_result.text == "<|eom_id|>":
stop_reason = StopReason.end_of_message
text = ""
else:
text = token_result.text
if ipython:
delta = ToolCallDelta(
content=text,
parse_status=ToolCallParseStatus.in_progress,
)
else:
delta = text
if stop_reason is None:
yield ChatCompletionResponseStreamChunk(
event=ChatCompletionResponseEvent(
event_type=ChatCompletionResponseEventType.progress,
delta=delta,
stop_reason=stop_reason,
)
)
if stop_reason is None:
stop_reason = StopReason.out_of_tokens
# parse tool calls and report errors
message = self.formatter.decode_assistant_message(tokens, stop_reason)
parsed_tool_calls = len(message.tool_calls) > 0
if ipython and not parsed_tool_calls:
yield ChatCompletionResponseStreamChunk(
event=ChatCompletionResponseEvent(
event_type=ChatCompletionResponseEventType.progress,
delta=ToolCallDelta(
content="",
parse_status=ToolCallParseStatus.failure,
),
stop_reason=stop_reason,
)
)
for tool_call in message.tool_calls:
yield ChatCompletionResponseStreamChunk(
event=ChatCompletionResponseEvent(
event_type=ChatCompletionResponseEventType.progress,
delta=ToolCallDelta(
content=tool_call,
parse_status=ToolCallParseStatus.success,
),
stop_reason=stop_reason,
)
)
yield ChatCompletionResponseStreamChunk(
event=ChatCompletionResponseEvent(
event_type=ChatCompletionResponseEventType.complete,
delta="",
stop_reason=stop_reason,
)
)
class InferenceEndpointAdapter(TGIAdapter):
def __init__(self, config: TGIImplConfig) -> None:
super().__init__(config)
self.config.url = self._construct_endpoint_url()
def _construct_endpoint_url(self) -> str:
hf_endpoint_name = self.config.hf_endpoint_name
assert hf_endpoint_name.count("/") <= 1, (
"Endpoint name must be in the format of 'namespace/endpoint_name' "
"or 'endpoint_name'"
)
if "/" not in hf_endpoint_name:
hf_namespace: str = self.get_namespace()
endpoint_path = f"{hf_namespace}/{hf_endpoint_name}"
else:
endpoint_path = hf_endpoint_name
return f"https://api.endpoints.huggingface.cloud/v2/endpoint/{endpoint_path}"
def get_namespace(self) -> str:
return HfApi().whoami()["name"]
@property
def client(self) -> InferenceClient:
return InferenceClient(model=self.inference_url, token=self.config.api_token)
def _get_endpoint_info(self) -> Dict[str, Any]:
headers = {
"accept": "application/json",
"authorization": f"Bearer {self.config.api_token}",
}
response = requests.get(self.config.url, headers=headers)
response.raise_for_status()
endpoint_info = response.json()
return {
"inference_url": endpoint_info["status"]["url"],
"model_id": endpoint_info["model"]["repository"],
"max_total_tokens": int(
endpoint_info["model"]["image"]["custom"]["env"]["MAX_TOTAL_TOKENS"]
),
}
async def initialize(self) -> None:
await super().initialize()

View file

@ -0,0 +1,18 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from .config import TogetherImplConfig
async def get_adapter_impl(config: TogetherImplConfig, _deps):
from .together import TogetherInferenceAdapter
assert isinstance(
config, TogetherImplConfig
), f"Unexpected config type: {type(config)}"
impl = TogetherInferenceAdapter(config)
await impl.initialize()
return impl

View file

@ -0,0 +1,20 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from llama_models.schema_utils import json_schema_type
from pydantic import BaseModel, Field
@json_schema_type
class TogetherImplConfig(BaseModel):
url: str = Field(
default="https://api.together.xyz/v1",
description="The URL for the Together AI server",
)
api_key: str = Field(
default="",
description="The Together AI API Key",
)

View file

@ -0,0 +1,251 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from typing import AsyncGenerator
from llama_models.llama3.api.chat_format import ChatFormat
from llama_models.llama3.api.datatypes import Message, StopReason
from llama_models.llama3.api.tokenizer import Tokenizer
from llama_models.sku_list import resolve_model
from together import Together
from llama_stack.inference.api import * # noqa: F403
from llama_stack.inference.prepare_messages import prepare_messages
from .config import TogetherImplConfig
TOGETHER_SUPPORTED_MODELS = {
"Meta-Llama3.1-8B-Instruct": "meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo",
"Meta-Llama3.1-70B-Instruct": "meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo",
"Meta-Llama3.1-405B-Instruct": "meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo",
}
class TogetherInferenceAdapter(Inference):
def __init__(self, config: TogetherImplConfig) -> None:
self.config = config
tokenizer = Tokenizer.get_instance()
self.formatter = ChatFormat(tokenizer)
@property
def client(self) -> Together:
return Together(api_key=self.config.api_key)
async def initialize(self) -> None:
return
async def shutdown(self) -> None:
pass
async def completion(self, request: CompletionRequest) -> AsyncGenerator:
raise NotImplementedError()
def _messages_to_together_messages(self, messages: list[Message]) -> list:
together_messages = []
for message in messages:
if message.role == "ipython":
role = "tool"
else:
role = message.role
together_messages.append({"role": role, "content": message.content})
return together_messages
def resolve_together_model(self, model_name: str) -> str:
model = resolve_model(model_name)
assert (
model is not None
and model.descriptor(shorten_default_variant=True)
in TOGETHER_SUPPORTED_MODELS
), f"Unsupported model: {model_name}, use one of the supported models: {','.join(TOGETHER_SUPPORTED_MODELS.keys())}"
return TOGETHER_SUPPORTED_MODELS.get(
model.descriptor(shorten_default_variant=True)
)
def get_together_chat_options(self, request: ChatCompletionRequest) -> dict:
options = {}
if request.sampling_params is not None:
for attr in {"temperature", "top_p", "top_k", "max_tokens"}:
if getattr(request.sampling_params, attr):
options[attr] = getattr(request.sampling_params, attr)
return options
async def chat_completion(
self,
model: str,
messages: List[Message],
sampling_params: Optional[SamplingParams] = SamplingParams(),
tools: Optional[List[ToolDefinition]] = list(),
tool_choice: Optional[ToolChoice] = ToolChoice.auto,
tool_prompt_format: Optional[ToolPromptFormat] = ToolPromptFormat.json,
stream: Optional[bool] = False,
logprobs: Optional[LogProbConfig] = None,
) -> AsyncGenerator:
# wrapper request to make it easier to pass around (internal only, not exposed to API)
request = ChatCompletionRequest(
model=model,
messages=messages,
sampling_params=sampling_params,
tools=tools,
tool_choice=tool_choice,
tool_prompt_format=tool_prompt_format,
stream=stream,
logprobs=logprobs,
)
# accumulate sampling params and other options to pass to together
options = self.get_together_chat_options(request)
together_model = self.resolve_together_model(request.model)
messages = prepare_messages(request)
if not request.stream:
# TODO: might need to add back an async here
r = self.client.chat.completions.create(
model=together_model,
messages=self._messages_to_together_messages(messages),
stream=False,
**options,
)
stop_reason = None
if r.choices[0].finish_reason:
if (
r.choices[0].finish_reason == "stop"
or r.choices[0].finish_reason == "eos"
):
stop_reason = StopReason.end_of_turn
elif r.choices[0].finish_reason == "length":
stop_reason = StopReason.out_of_tokens
completion_message = self.formatter.decode_assistant_message_from_content(
r.choices[0].message.content, stop_reason
)
yield ChatCompletionResponse(
completion_message=completion_message,
logprobs=None,
)
else:
yield ChatCompletionResponseStreamChunk(
event=ChatCompletionResponseEvent(
event_type=ChatCompletionResponseEventType.start,
delta="",
)
)
buffer = ""
ipython = False
stop_reason = None
for chunk in self.client.chat.completions.create(
model=together_model,
messages=self._messages_to_together_messages(messages),
stream=True,
**options,
):
if chunk.choices[0].finish_reason:
if (
stop_reason is None and chunk.choices[0].finish_reason == "stop"
) or (
stop_reason is None and chunk.choices[0].finish_reason == "eos"
):
stop_reason = StopReason.end_of_turn
elif (
stop_reason is None
and chunk.choices[0].finish_reason == "length"
):
stop_reason = StopReason.out_of_tokens
break
text = chunk.choices[0].delta.content
if text is None:
continue
# check if it's a tool call (i.e., it starts with <|python_tag|>)
if not ipython and text.startswith("<|python_tag|>"):
ipython = True
yield ChatCompletionResponseStreamChunk(
event=ChatCompletionResponseEvent(
event_type=ChatCompletionResponseEventType.progress,
delta=ToolCallDelta(
content="",
parse_status=ToolCallParseStatus.started,
),
)
)
buffer += text
continue
if ipython:
if text == "<|eot_id|>":
stop_reason = StopReason.end_of_turn
text = ""
continue
elif text == "<|eom_id|>":
stop_reason = StopReason.end_of_message
text = ""
continue
buffer += text
delta = ToolCallDelta(
content=text,
parse_status=ToolCallParseStatus.in_progress,
)
yield ChatCompletionResponseStreamChunk(
event=ChatCompletionResponseEvent(
event_type=ChatCompletionResponseEventType.progress,
delta=delta,
stop_reason=stop_reason,
)
)
else:
buffer += text
yield ChatCompletionResponseStreamChunk(
event=ChatCompletionResponseEvent(
event_type=ChatCompletionResponseEventType.progress,
delta=text,
stop_reason=stop_reason,
)
)
# parse tool calls and report errors
message = self.formatter.decode_assistant_message_from_content(
buffer, stop_reason
)
parsed_tool_calls = len(message.tool_calls) > 0
if ipython and not parsed_tool_calls:
yield ChatCompletionResponseStreamChunk(
event=ChatCompletionResponseEvent(
event_type=ChatCompletionResponseEventType.progress,
delta=ToolCallDelta(
content="",
parse_status=ToolCallParseStatus.failure,
),
stop_reason=stop_reason,
)
)
for tool_call in message.tool_calls:
yield ChatCompletionResponseStreamChunk(
event=ChatCompletionResponseEvent(
event_type=ChatCompletionResponseEventType.progress,
delta=ToolCallDelta(
content=tool_call,
parse_status=ToolCallParseStatus.success,
),
stop_reason=stop_reason,
)
)
yield ChatCompletionResponseStreamChunk(
event=ChatCompletionResponseEvent(
event_type=ChatCompletionResponseEventType.complete,
delta="",
stop_reason=stop_reason,
)
)

View file

@ -0,0 +1,7 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from .api import * # noqa: F401 F403

View file

@ -0,0 +1,205 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from enum import Enum
from typing import List, Literal, Optional, Protocol, Union
from llama_models.schema_utils import json_schema_type, webmethod
from pydantic import BaseModel, Field
from typing_extensions import Annotated
from llama_models.llama3.api.datatypes import * # noqa: F403
class LogProbConfig(BaseModel):
top_k: Optional[int] = 0
@json_schema_type
class QuantizationType(Enum):
bf16 = "bf16"
fp8 = "fp8"
@json_schema_type
class Fp8QuantizationConfig(BaseModel):
type: Literal[QuantizationType.fp8.value] = QuantizationType.fp8.value
@json_schema_type
class Bf16QuantizationConfig(BaseModel):
type: Literal[QuantizationType.bf16.value] = QuantizationType.bf16.value
QuantizationConfig = Annotated[
Union[Bf16QuantizationConfig, Fp8QuantizationConfig],
Field(discriminator="type"),
]
@json_schema_type
class ChatCompletionResponseEventType(Enum):
start = "start"
complete = "complete"
progress = "progress"
@json_schema_type
class ToolCallParseStatus(Enum):
started = "started"
in_progress = "in_progress"
failure = "failure"
success = "success"
@json_schema_type
class ToolCallDelta(BaseModel):
content: Union[str, ToolCall]
parse_status: ToolCallParseStatus
@json_schema_type
class ChatCompletionResponseEvent(BaseModel):
"""Chat completion response event."""
event_type: ChatCompletionResponseEventType
delta: Union[str, ToolCallDelta]
logprobs: Optional[List[TokenLogProbs]] = None
stop_reason: Optional[StopReason] = None
@json_schema_type
class CompletionRequest(BaseModel):
model: str
content: InterleavedTextMedia
sampling_params: Optional[SamplingParams] = SamplingParams()
stream: Optional[bool] = False
logprobs: Optional[LogProbConfig] = None
@json_schema_type
class CompletionResponse(BaseModel):
"""Completion response."""
completion_message: CompletionMessage
logprobs: Optional[List[TokenLogProbs]] = None
@json_schema_type
class CompletionResponseStreamChunk(BaseModel):
"""streamed completion response."""
delta: str
stop_reason: Optional[StopReason] = None
logprobs: Optional[List[TokenLogProbs]] = None
@json_schema_type
class BatchCompletionRequest(BaseModel):
model: str
content_batch: List[InterleavedTextMedia]
sampling_params: Optional[SamplingParams] = SamplingParams()
logprobs: Optional[LogProbConfig] = None
@json_schema_type
class BatchCompletionResponse(BaseModel):
"""Batch completion response."""
completion_message_batch: List[CompletionMessage]
@json_schema_type
class ChatCompletionRequest(BaseModel):
model: str
messages: List[Message]
sampling_params: Optional[SamplingParams] = SamplingParams()
# zero-shot tool definitions as input to the model
tools: Optional[List[ToolDefinition]] = Field(default_factory=list)
tool_choice: Optional[ToolChoice] = Field(default=ToolChoice.auto)
tool_prompt_format: Optional[ToolPromptFormat] = Field(
default=ToolPromptFormat.json
)
stream: Optional[bool] = False
logprobs: Optional[LogProbConfig] = None
@json_schema_type
class ChatCompletionResponseStreamChunk(BaseModel):
"""SSE-stream of these events."""
event: ChatCompletionResponseEvent
@json_schema_type
class ChatCompletionResponse(BaseModel):
"""Chat completion response."""
completion_message: CompletionMessage
logprobs: Optional[List[TokenLogProbs]] = None
@json_schema_type
class BatchChatCompletionRequest(BaseModel):
model: str
messages_batch: List[List[Message]]
sampling_params: Optional[SamplingParams] = SamplingParams()
# zero-shot tool definitions as input to the model
tools: Optional[List[ToolDefinition]] = Field(default_factory=list)
tool_choice: Optional[ToolChoice] = Field(default=ToolChoice.auto)
tool_prompt_format: Optional[ToolPromptFormat] = Field(
default=ToolPromptFormat.json
)
logprobs: Optional[LogProbConfig] = None
@json_schema_type
class BatchChatCompletionResponse(BaseModel):
completion_message_batch: List[CompletionMessage]
@json_schema_type
class EmbeddingsResponse(BaseModel):
embeddings: List[List[float]]
class Inference(Protocol):
@webmethod(route="/inference/completion")
async def completion(
self,
model: str,
content: InterleavedTextMedia,
sampling_params: Optional[SamplingParams] = SamplingParams(),
stream: Optional[bool] = False,
logprobs: Optional[LogProbConfig] = None,
) -> Union[CompletionResponse, CompletionResponseStreamChunk]: ...
@webmethod(route="/inference/chat_completion")
async def chat_completion(
self,
model: str,
messages: List[Message],
sampling_params: Optional[SamplingParams] = SamplingParams(),
# zero-shot tool definitions as input to the model
tools: Optional[List[ToolDefinition]] = list(),
tool_choice: Optional[ToolChoice] = ToolChoice.auto,
tool_prompt_format: Optional[ToolPromptFormat] = ToolPromptFormat.json,
stream: Optional[bool] = False,
logprobs: Optional[LogProbConfig] = None,
) -> Union[ChatCompletionResponse, ChatCompletionResponseStreamChunk]: ...
@webmethod(route="/inference/embeddings")
async def embeddings(
self,
model: str,
contents: List[InterleavedTextMedia],
) -> EmbeddingsResponse: ...

View file

@ -0,0 +1,106 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import asyncio
import json
from typing import Any, AsyncGenerator
import fire
import httpx
from pydantic import BaseModel
from termcolor import cprint
from llama_stack.core.datatypes import RemoteProviderConfig
from .api import (
ChatCompletionRequest,
ChatCompletionResponse,
ChatCompletionResponseStreamChunk,
CompletionRequest,
Inference,
UserMessage,
)
from .event_logger import EventLogger
async def get_client_impl(config: RemoteProviderConfig, _deps: Any) -> Inference:
return InferenceClient(config.url)
def encodable_dict(d: BaseModel):
return json.loads(d.json())
class InferenceClient(Inference):
def __init__(self, base_url: str):
self.base_url = base_url
async def initialize(self) -> None:
pass
async def shutdown(self) -> None:
pass
async def completion(self, request: CompletionRequest) -> AsyncGenerator:
raise NotImplementedError()
async def chat_completion(self, request: ChatCompletionRequest) -> AsyncGenerator:
async with httpx.AsyncClient() as client:
async with client.stream(
"POST",
f"{self.base_url}/inference/chat_completion",
json=encodable_dict(request),
headers={"Content-Type": "application/json"},
timeout=20,
) as response:
if response.status_code != 200:
content = await response.aread()
cprint(
f"Error: HTTP {response.status_code} {content.decode()}", "red"
)
return
async for line in response.aiter_lines():
if line.startswith("data:"):
data = line[len("data: ") :]
try:
if request.stream:
if "error" in data:
cprint(data, "red")
continue
yield ChatCompletionResponseStreamChunk(
**json.loads(data)
)
else:
yield ChatCompletionResponse(**json.loads(data))
except Exception as e:
print(data)
print(f"Error with parsing or validation: {e}")
async def run_main(host: str, port: int, stream: bool):
client = InferenceClient(f"http://{host}:{port}")
message = UserMessage(content="hello world, troll me in two-paragraphs about 42")
cprint(f"User>{message.content}", "green")
iterator = client.chat_completion(
ChatCompletionRequest(
model="Meta-Llama3.1-8B-Instruct",
messages=[message],
stream=stream,
)
)
async for log in EventLogger().log(iterator):
log.print()
def main(host: str, port: int, stream: bool = True):
asyncio.run(run_main(host, port, stream))
if __name__ == "__main__":
fire.Fire(main)
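# Illustrative invocation (the module path is an assumption based on this repo's layout,
# and a llama-stack inference server must already be running on the given host/port):
#
#     python -m llama_stack.inference.client localhost 5000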

View file

@ -0,0 +1,43 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from termcolor import cprint
from llama_stack.inference.api import (
ChatCompletionResponseEventType,
ChatCompletionResponseStreamChunk,
)
class LogEvent:
def __init__(
self,
content: str = "",
end: str = "\n",
color="white",
):
self.content = content
self.color = color
self.end = "\n" if end is None else end
def print(self, flush=True):
cprint(f"{self.content}", color=self.color, end=self.end, flush=flush)
class EventLogger:
async def log(self, event_generator):
async for chunk in event_generator:
if isinstance(chunk, ChatCompletionResponseStreamChunk):
event = chunk.event
if event.event_type == ChatCompletionResponseEventType.start:
yield LogEvent("Assistant> ", color="cyan", end="")
elif event.event_type == ChatCompletionResponseEventType.progress:
yield LogEvent(event.delta, color="yellow", end="")
elif event.event_type == ChatCompletionResponseEventType.complete:
yield LogEvent("")
else:
yield LogEvent("Assistant> ", color="cyan", end="")
yield LogEvent(chunk.completion_message.content, color="yellow")

View file

@ -0,0 +1,19 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from .config import MetaReferenceImplConfig # noqa
async def get_provider_impl(config: MetaReferenceImplConfig, _deps):
from .inference import MetaReferenceInferenceImpl
assert isinstance(
config, MetaReferenceImplConfig
), f"Unexpected config type: {type(config)}"
impl = MetaReferenceInferenceImpl(config)
await impl.initialize()
return impl

View file

@ -0,0 +1,57 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from typing import Optional
from llama_models.datatypes import ModelFamily
from llama_models.schema_utils import json_schema_type
from llama_models.sku_list import all_registered_models, resolve_model
from pydantic import BaseModel, Field, field_validator
from llama_stack.inference.api import QuantizationConfig
@json_schema_type
class MetaReferenceImplConfig(BaseModel):
model: str = Field(
default="Meta-Llama3.1-8B-Instruct",
description="Model descriptor from `llama model list`",
)
quantization: Optional[QuantizationConfig] = None
torch_seed: Optional[int] = None
max_seq_len: int
max_batch_size: int = 1
@field_validator("model")
@classmethod
def validate_model(cls, model: str) -> str:
permitted_models = [
m.descriptor()
for m in all_registered_models()
if m.model_family == ModelFamily.llama3_1
]
if model not in permitted_models:
model_list = "\n\t".join(permitted_models)
raise ValueError(
f"Unknown model: `{model}`. Choose from [\n\t{model_list}\n]"
)
return model
@property
def model_parallel_size(self) -> int:
# HUGE HACK ALERT: this will be fixed when we move inference configuration
# to ModelsRegistry and we can explicitly ask for `model_parallel_size`
# as configuration there
gpu_count = 1
resolved = resolve_model(self.model)
assert resolved is not None
descriptor = resolved.descriptor().lower()
if "-70b" in descriptor or "-405b" in descriptor:
gpu_count = 8
return gpu_count
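# Illustrative sketch (the values below are assumptions, not part of this diff):
#
#     config = MetaReferenceImplConfig(
#         model="Meta-Llama3.1-8B-Instruct",
#         max_seq_len=4096,
#     )
#     config.model_parallel_size  # 1 for the 8B model, 8 for "-70b" / "-405b" descriptors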

View file

@ -0,0 +1,327 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
# Copyright (c) Meta Platforms, Inc. and affiliates.
# This software may be used and distributed in accordance with the terms of the Llama 3 Community License Agreement.
import json
import os
import sys
import time
from dataclasses import dataclass
from pathlib import Path
from typing import Generator, List, Optional
import torch
import torch.nn.functional as F
from fairscale.nn.model_parallel.initialize import (
get_model_parallel_rank,
initialize_model_parallel,
model_parallel_is_initialized,
)
from llama_models.llama3.api.args import ModelArgs
from llama_models.llama3.api.chat_format import ChatFormat, ModelInput
from llama_models.llama3.api.datatypes import Message, ToolPromptFormat
from llama_models.llama3.api.tokenizer import Tokenizer
from llama_models.llama3.reference_impl.model import Transformer
from llama_models.sku_list import resolve_model
from termcolor import cprint
from llama_stack.common.model_utils import model_local_dir
from llama_stack.inference.api import QuantizationType
from .config import MetaReferenceImplConfig
def model_checkpoint_dir(model) -> str:
checkpoint_dir = Path(model_local_dir(model.descriptor()))
if not Path(checkpoint_dir / "consolidated.00.pth").exists():
checkpoint_dir = checkpoint_dir / "original"
assert checkpoint_dir.exists(), (
f"Could not find checkpoint dir: {checkpoint_dir}."
f"Please download model using `llama download {model.descriptor()}`"
)
return str(checkpoint_dir)
@dataclass
class TokenResult:
token: int
text: str
logprobs: Optional[List[float]] = None
class Llama:
@staticmethod
def build(config: MetaReferenceImplConfig):
"""
Build a Llama instance by initializing and loading a model checkpoint.
Note:
This method initializes the distributed process group, sets the device to CUDA,
and loads the pre-trained model and tokenizer.
"""
model = resolve_model(config.model)
if (
config.quantization
and config.quantization.type == QuantizationType.fp8.value
):
from .quantization.loader import is_fbgemm_available
if not is_fbgemm_available():
raise ImportError("fbgemm-gpu is required for FP8 quantization")
if not torch.distributed.is_initialized():
torch.distributed.init_process_group("nccl")
model_parallel_size = config.model_parallel_size
if not model_parallel_is_initialized():
initialize_model_parallel(model_parallel_size)
local_rank = int(os.environ.get("LOCAL_RANK", 0))
torch.cuda.set_device(local_rank)
# seed must be the same in all processes
if config.torch_seed is not None:
torch.manual_seed(config.torch_seed)
if local_rank > 0:
sys.stdout = open(os.devnull, "w")
start_time = time.time()
ckpt_dir = model_checkpoint_dir(model)
checkpoints = sorted(Path(ckpt_dir).glob("*.pth"))
assert len(checkpoints) > 0, f"no checkpoint files found in {ckpt_dir}"
assert model_parallel_size == len(
checkpoints
), f"Loading a checkpoint for MP={len(checkpoints)} but world size is {model_parallel_size}"
ckpt_path = checkpoints[get_model_parallel_rank()]
state_dict = torch.load(ckpt_path, map_location="cpu", weights_only=True)
with open(Path(ckpt_dir) / "params.json", "r") as f:
params = json.loads(f.read())
if "model" in params:
params = params["model"]
model_args: ModelArgs = ModelArgs(
max_seq_len=config.max_seq_len,
max_batch_size=config.max_batch_size,
**params,
)
tokenizer_path = os.path.join(ckpt_dir, "tokenizer.model")
tokenizer = Tokenizer(model_path=tokenizer_path)
assert (
model_args.vocab_size == tokenizer.n_words
), f"model_args vocab = {model_args.vocab_size} but tokenizer vocab = {tokenizer.n_words}"
fp8 = (
config.quantization
and config.quantization.type == QuantizationType.fp8.value
)
if fp8:
from .quantization.loader import convert_to_quantized_model
# load on CPU in bf16 so that fp8 conversion does not find an
# unexpected (fp32, e.g.) datatype
torch.set_default_tensor_type(torch.BFloat16Tensor)
model = Transformer(model_args)
model.load_state_dict(state_dict, strict=False)
model = convert_to_quantized_model(model, config)
else:
if torch.cuda.is_bf16_supported():
torch.set_default_tensor_type(torch.cuda.BFloat16Tensor)
else:
torch.set_default_tensor_type(torch.cuda.HalfTensor)
model = Transformer(model_args)
model.load_state_dict(state_dict, strict=False)
print(f"Loaded in {time.time() - start_time:.2f} seconds")
return Llama(model, tokenizer, model_args)
def __init__(self, model: Transformer, tokenizer: Tokenizer, args: ModelArgs):
self.args = args
self.model = model
self.tokenizer = tokenizer
self.formatter = ChatFormat(tokenizer)
@torch.inference_mode()
def generate(
self,
model_input: ModelInput,
max_gen_len: int,
temperature: float = 0.6,
top_p: float = 0.9,
logprobs: bool = False,
echo: bool = False,
include_stop_token: bool = False,
) -> Generator:
params = self.model.params
# cprint("Input to model -> " + self.tokenizer.decode(model_input.tokens), "red")
prompt_tokens = [model_input.tokens]
bsz = 1
assert bsz <= params.max_batch_size, (bsz, params.max_batch_size)
min_prompt_len = min(len(t) for t in prompt_tokens)
max_prompt_len = max(len(t) for t in prompt_tokens)
if max_prompt_len >= params.max_seq_len:
cprint(
f"Out of token budget {max_prompt_len} vs {params.max_seq_len}", "red"
)
return
total_len = min(max_gen_len + max_prompt_len, params.max_seq_len)
pad_id = self.tokenizer.pad_id
tokens = torch.full((bsz, total_len), pad_id, dtype=torch.long, device="cuda")
for k, t in enumerate(prompt_tokens):
tokens[k, : len(t)] = torch.tensor(t, dtype=torch.long, device="cuda")
if logprobs:
token_logprobs = torch.zeros_like(tokens, dtype=torch.float)
prev_pos = 0
eos_reached = torch.tensor([False] * bsz, device="cuda")
input_text_mask = tokens != pad_id
if min_prompt_len == total_len:
# TODO(ashwin): unify this branch with the one below and figure out multimodal crap
logits = self.model.forward(tokens, prev_pos)
token_logprobs = -F.cross_entropy(
input=logits.transpose(1, 2),
target=tokens,
reduction="none",
ignore_index=pad_id,
)
stop_tokens = torch.tensor(self.tokenizer.stop_tokens)
for cur_pos in range(min_prompt_len, total_len):
logits = self.model.forward(tokens[:, prev_pos:cur_pos], prev_pos)
if temperature > 0:
probs = torch.softmax(logits[:, -1] / temperature, dim=-1)
next_token = sample_top_p(probs, top_p)
else:
next_token = torch.argmax(logits[:, -1], dim=-1)
next_token = next_token.reshape(-1)
# only replace token if prompt has already been generated
next_token = torch.where(
input_text_mask[:, cur_pos], tokens[:, cur_pos], next_token
)
tokens[:, cur_pos] = next_token
target = tokens[:, prev_pos + 1 : cur_pos + 1]
if logprobs:
token_logprobs[:, prev_pos + 1 : cur_pos + 1] = -F.cross_entropy(
input=logits.transpose(1, 2),
target=tokens[:, prev_pos + 1 : cur_pos + 1],
reduction="none",
ignore_index=pad_id,
)
eos_reached |= (~input_text_mask[:, cur_pos]) & (
torch.isin(next_token, stop_tokens)
)
yield TokenResult(
token=next_token[0].item(),
text=self.tokenizer.decode(next_token.tolist()),
logprobs=(
token_logprobs[:, prev_pos + 1 : cur_pos + 1][0].tolist()
if logprobs
else None
),
)
prev_pos = cur_pos
if all(eos_reached):
break
def text_completion(
self,
prompt: str,
temperature: float = 0.6,
top_p: float = 0.9,
max_gen_len: Optional[int] = None,
logprobs: bool = False,
echo: bool = False,
) -> Generator:
if (
max_gen_len is None
or max_gen_len == 0
or max_gen_len >= self.model.params.max_seq_len
):
max_gen_len = self.model.params.max_seq_len - 1
prompt_tokens = self.tokenizer.encode(prompt, bos=True, eos=False)
yield from self.generate(
model_input=ModelInput(tokens=prompt_tokens),
max_gen_len=max_gen_len,
temperature=temperature,
top_p=top_p,
logprobs=logprobs,
echo=echo,
)
def chat_completion(
self,
messages: List[Message],
temperature: float = 0.6,
top_p: float = 0.9,
max_gen_len: Optional[int] = None,
logprobs: bool = False,
tool_prompt_format: ToolPromptFormat = ToolPromptFormat.json,
) -> Generator:
if (
max_gen_len is None
or max_gen_len == 0
or max_gen_len >= self.model.params.max_seq_len
):
max_gen_len = self.model.params.max_seq_len - 1
yield from self.generate(
model_input=self.formatter.encode_dialog_prompt(
messages,
tool_prompt_format,
),
max_gen_len=max_gen_len,
temperature=temperature,
top_p=top_p,
logprobs=logprobs,
include_stop_token=True,
)
def sample_top_p(probs, p):
"""
Perform top-p (nucleus) sampling on a probability distribution.
Args:
probs (torch.Tensor): Probability distribution tensor.
p (float): Probability threshold for top-p sampling.
Returns:
torch.Tensor: Sampled token indices.
Note:
Top-p sampling selects the smallest set of tokens whose cumulative probability mass
exceeds the threshold p. The distribution is renormalized based on the selected tokens.
"""
probs_sort, probs_idx = torch.sort(probs, dim=-1, descending=True)
probs_sum = torch.cumsum(probs_sort, dim=-1)
mask = probs_sum - probs_sort > p
probs_sort[mask] = 0.0
probs_sort.div_(probs_sort.sum(dim=-1, keepdim=True))
next_token = torch.multinomial(probs_sort, num_samples=1)
next_token = torch.gather(probs_idx, -1, next_token)
return next_token
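# Worked example (illustrative, not part of this diff): for probs = [0.5, 0.3, 0.15, 0.05]
# and p = 0.9, the sorted cumulative mass is [0.5, 0.8, 0.95, 1.0]; only the last token
# satisfies (cumsum - prob) > p, so the first three tokens are kept, renormalized by 0.95,
# and sampled from:
#
#     probs = torch.tensor([[0.5, 0.3, 0.15, 0.05]])
#     next_token = sample_top_p(probs, p=0.9)  # always one of indices {0, 1, 2}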

View file

@ -0,0 +1,215 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import asyncio
from typing import AsyncIterator, Union
from llama_models.llama3.api.datatypes import StopReason
from llama_models.sku_list import resolve_model
from llama_stack.inference.api import (
ChatCompletionRequest,
ChatCompletionResponse,
ChatCompletionResponseEvent,
ChatCompletionResponseEventType,
ChatCompletionResponseStreamChunk,
Inference,
ToolCallDelta,
ToolCallParseStatus,
)
from llama_stack.inference.prepare_messages import prepare_messages
from .config import MetaReferenceImplConfig
from .model_parallel import LlamaModelParallelGenerator
from llama_models.llama3.api.datatypes import * # noqa: F403
from llama_stack.inference.api import * # noqa: F403
# there's a single model parallel process running serving the model. for now,
# we don't support multiple concurrent requests to this process.
SEMAPHORE = asyncio.Semaphore(1)
class MetaReferenceInferenceImpl(Inference):
def __init__(self, config: MetaReferenceImplConfig) -> None:
self.config = config
model = resolve_model(config.model)
if model is None:
raise RuntimeError(f"Unknown model: {config.model}, Run `llama model list`")
self.model = model
# verify that the checkpoint actually is for this model lol
async def initialize(self) -> None:
self.generator = LlamaModelParallelGenerator(self.config)
self.generator.start()
async def shutdown(self) -> None:
self.generator.stop()
# hm, when stream=False, we should not be doing SSE :/ which is what the
# top-level server is going to do. make the typing more specific here
async def chat_completion(
self,
model: str,
messages: List[Message],
sampling_params: Optional[SamplingParams] = SamplingParams(),
tools: Optional[List[ToolDefinition]] = list(),
tool_choice: Optional[ToolChoice] = ToolChoice.auto,
tool_prompt_format: Optional[ToolPromptFormat] = ToolPromptFormat.json,
stream: Optional[bool] = False,
logprobs: Optional[LogProbConfig] = None,
) -> AsyncIterator[
Union[ChatCompletionResponseStreamChunk, ChatCompletionResponse]
]:
# wrapper request to make it easier to pass around (internal only, not exposed to API)
request = ChatCompletionRequest(
model=model,
messages=messages,
sampling_params=sampling_params,
tools=tools,
tool_choice=tool_choice,
tool_prompt_format=tool_prompt_format,
stream=stream,
logprobs=logprobs,
)
messages = prepare_messages(request)
model = resolve_model(request.model)
if model is None:
raise RuntimeError(
f"Unknown model: {request.model}, Run `llama model list`"
)
elif model.descriptor() != self.model.descriptor():
raise RuntimeError(
f"Model mismatch: {request.model} != {self.model.descriptor()}"
)
if SEMAPHORE.locked():
raise RuntimeError("Only one concurrent request is supported")
async with SEMAPHORE:
if request.stream:
yield ChatCompletionResponseStreamChunk(
event=ChatCompletionResponseEvent(
event_type=ChatCompletionResponseEventType.start,
delta="",
)
)
tokens = []
logprobs = []
stop_reason = None
buffer = ""
ipython = False
for token_result in self.generator.chat_completion(
messages=messages,
temperature=request.sampling_params.temperature,
top_p=request.sampling_params.top_p,
max_gen_len=request.sampling_params.max_tokens,
logprobs=request.logprobs,
tool_prompt_format=request.tool_prompt_format,
):
buffer += token_result.text
tokens.append(token_result.token)
if not ipython and buffer.startswith("<|python_tag|>"):
ipython = True
yield ChatCompletionResponseStreamChunk(
event=ChatCompletionResponseEvent(
event_type=ChatCompletionResponseEventType.progress,
delta=ToolCallDelta(
content="",
parse_status=ToolCallParseStatus.started,
),
)
)
buffer = buffer[len("<|python_tag|>") :]
continue
if not request.stream:
if request.logprobs:
logprobs.append(token_result.logprobs)
continue
if token_result.text == "<|eot_id|>":
stop_reason = StopReason.end_of_turn
text = ""
elif token_result.text == "<|eom_id|>":
stop_reason = StopReason.end_of_message
text = ""
else:
text = token_result.text
if ipython:
delta = ToolCallDelta(
content=text,
parse_status=ToolCallParseStatus.in_progress,
)
else:
delta = text
if stop_reason is None:
yield ChatCompletionResponseStreamChunk(
event=ChatCompletionResponseEvent(
event_type=ChatCompletionResponseEventType.progress,
delta=delta,
stop_reason=stop_reason,
)
)
if stop_reason is None:
stop_reason = StopReason.out_of_tokens
# TODO(ashwin): parse tool calls separately here and report errors?
# if someone breaks the iteration before coming here we are toast
message = self.generator.formatter.decode_assistant_message(
tokens, stop_reason
)
if request.stream:
parsed_tool_calls = len(message.tool_calls) > 0
if ipython and not parsed_tool_calls:
yield ChatCompletionResponseStreamChunk(
event=ChatCompletionResponseEvent(
event_type=ChatCompletionResponseEventType.progress,
delta=ToolCallDelta(
content="",
parse_status=ToolCallParseStatus.failure,
),
stop_reason=stop_reason,
)
)
for tool_call in message.tool_calls:
yield ChatCompletionResponseStreamChunk(
event=ChatCompletionResponseEvent(
event_type=ChatCompletionResponseEventType.progress,
delta=ToolCallDelta(
content=tool_call,
parse_status=ToolCallParseStatus.success,
),
stop_reason=stop_reason,
)
)
yield ChatCompletionResponseStreamChunk(
event=ChatCompletionResponseEvent(
event_type=ChatCompletionResponseEventType.complete,
delta="",
stop_reason=stop_reason,
)
)
# TODO(ashwin): what else do we need to send out here when everything finishes?
else:
yield ChatCompletionResponse(
completion_message=message,
logprobs=logprobs if request.logprobs else None,
)

View file

@ -0,0 +1,110 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import os
from copy import deepcopy
from dataclasses import dataclass
from functools import partial
from typing import Generator, List, Optional
from llama_models.llama3.api.chat_format import ChatFormat
from llama_models.llama3.api.datatypes import Message, ToolPromptFormat
from llama_models.llama3.api.tokenizer import Tokenizer
from llama_models.sku_list import resolve_model
from .config import MetaReferenceImplConfig
from .generation import Llama, model_checkpoint_dir
from .parallel_utils import ModelParallelProcessGroup
@dataclass
class InferenceArgs:
messages: List[Message]
temperature: float
top_p: float
max_gen_len: int
logprobs: bool
tool_prompt_format: ToolPromptFormat
class ModelRunner:
def __init__(self, llama):
self.llama = llama
# the `task` object is the same that is sent to `ModelParallelProcessGroup.run_inference()`
def __call__(self, task: InferenceArgs):
return self.llama.chat_completion(
task.messages,
task.temperature,
task.top_p,
task.max_gen_len,
task.logprobs,
task.tool_prompt_format,
)
def init_model_cb(config: MetaReferenceImplConfig):
llama = Llama.build(config)
return ModelRunner(llama)
class LlamaModelParallelGenerator:
"""
This abstraction exists so
- we can run model parallel code without needing to run the CLIs via torchrun
- this also enables using model parallel code within a notebook context.
A Context Manager is used to ensure that the model parallel process is started and stopped
correctly. This does make the ergonomics a little awkward, because it isn't immediately
clear at the callsite why we need to use a context manager.
"""
def __init__(self, config: MetaReferenceImplConfig):
self.config = config
self.model = resolve_model(self.config.model)
# this is a hack because Agent's loop uses this to tokenize and check if input is too long
# while the tool-use loop is going
checkpoint_dir = model_checkpoint_dir(self.model)
tokenizer_path = os.path.join(checkpoint_dir, "tokenizer.model")
self.formatter = ChatFormat(Tokenizer(tokenizer_path))
def start(self):
self.__enter__()
def stop(self):
self.__exit__(None, None, None)
def __enter__(self):
self.group = ModelParallelProcessGroup(
self.config.model_parallel_size,
init_model_cb=partial(init_model_cb, self.config),
)
self.group.start()
return self
def __exit__(self, exc_type, exc_value, exc_traceback):
self.group.stop()
def chat_completion(
self,
messages: List[Message],
temperature: float = 0.6,
top_p: float = 0.9,
max_gen_len: Optional[int] = None,
logprobs: bool = False,
tool_prompt_format: ToolPromptFormat = ToolPromptFormat.json,
) -> Generator:
req_obj = InferenceArgs(
messages=deepcopy(messages),
temperature=temperature,
top_p=top_p,
max_gen_len=max_gen_len,
logprobs=logprobs,
tool_prompt_format=tool_prompt_format,
)
gen = self.group.run_inference(req_obj)
yield from gen
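# Illustrative usage sketch (assumes a valid MetaReferenceImplConfig named `config`
# and a list of Message objects named `messages`):
#
#     with LlamaModelParallelGenerator(config) as generator:
#         for token_result in generator.chat_completion(messages=messages):
#             print(token_result.text, end="")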

View file

@ -0,0 +1,265 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import multiprocessing
import os
import pickle
import tempfile
import time
import uuid
from typing import Callable, Generator
import torch
import zmq
from fairscale.nn.model_parallel.initialize import (
get_model_parallel_group,
get_model_parallel_rank,
get_model_parallel_src_rank,
)
from torch.distributed.launcher.api import elastic_launch, LaunchConfig
_END_SENTINEL = "__end_sentinel__"
_CANCEL_SENTINEL = "__cancel_sentinel__"
def mp_rank_0() -> bool:
return get_model_parallel_rank() == 0
def retrieve_requests(reply_socket_url: str):
if mp_rank_0():
context = zmq.Context()
reply_socket = context.socket(zmq.ROUTER)
reply_socket.connect(reply_socket_url)
while True:
client_id, obj = maybe_get_work(reply_socket)
if obj is None:
time.sleep(0.01)
continue
reply_socket.send_multipart([client_id, pickle.dumps("YES READY")])
break
def send_obj(obj):
reply_socket.send_multipart([client_id, pickle.dumps(obj)])
while True:
tasks = [None]
if mp_rank_0():
client_id, task = maybe_get_work(reply_socket)
# there is still an unknown unclean GeneratorExit happening resulting in a
# cancel sentinel getting queued _after_ we have finished sending everything :/
# kind of a hack this is :/
if task != _CANCEL_SENTINEL:
tasks = [task]
torch.distributed.broadcast_object_list(
tasks,
src=get_model_parallel_src_rank(),
group=get_model_parallel_group(),
)
task = tasks[0]
if task is None:
time.sleep(0.1)
else:
try:
out = yield task
if out is None:
break
for obj in out:
updates = [None]
if mp_rank_0():
_, update = maybe_get_work(reply_socket)
if update == _CANCEL_SENTINEL:
updates = [update]
else:
# only send the update if it's not cancelled otherwise the object sits in the socket
# and gets pulled in the next request lol
send_obj(obj)
torch.distributed.broadcast_object_list(
updates,
src=get_model_parallel_src_rank(),
group=get_model_parallel_group(),
)
if updates[0] == _CANCEL_SENTINEL:
print("quitting generation loop because request was cancelled")
break
if mp_rank_0():
send_obj(_END_SENTINEL)
except Exception as e:
print(f"[debug] got exception {e}")
import traceback
traceback.print_exc()
if mp_rank_0():
send_obj(e)
if mp_rank_0():
send_obj("DONE")
def maybe_get_work(sock: zmq.Socket):
message = None
client_id = None
try:
client_id, obj = sock.recv_multipart(zmq.NOBLOCK)
message = pickle.loads(obj)
except zmq.ZMQError as e:
if e.errno != zmq.EAGAIN:
raise e
return client_id, message
def worker_process_entrypoint(
reply_socket_url: str,
init_model_cb: Callable,
) -> None:
model = init_model_cb()
torch.distributed.barrier()
time.sleep(1)
# run the requests co-routine which retrieves requests from the socket
# and sends responses (we provide) back to the caller
req_gen = retrieve_requests(reply_socket_url)
result = None
while True:
try:
task = req_gen.send(result)
if isinstance(task, str) and task == _END_SENTINEL:
break
result = model(task)
except StopIteration:
break
print("[debug] worker process done")
def launch_dist_group(
reply_socket_url: str,
model_parallel_size: int,
init_model_cb: Callable,
**kwargs,
) -> None:
id = uuid.uuid4().hex
dist_url = f"file:///tmp/llama3_{id}_{time.time()}"
with tempfile.TemporaryDirectory() as tmpdir:
# TODO: track workers and if they terminate, tell parent process about it so cleanup can happen
launch_config = LaunchConfig(
max_nodes=1,
min_nodes=1,
nproc_per_node=model_parallel_size,
start_method="fork",
rdzv_backend="c10d",
rdzv_endpoint=os.path.join(tmpdir, "rdzv"),
rdzv_configs={"store_type": "file", "timeout": 90},
max_restarts=0,
monitor_interval=1,
run_id=str(uuid.uuid4()),
)
elastic_launch(launch_config, entrypoint=worker_process_entrypoint)(
reply_socket_url,
init_model_cb,
)
def start_model_parallel_process(
model_parallel_size: int,
init_model_cb: Callable,
**kwargs,
):
context = zmq.Context()
request_socket = context.socket(zmq.DEALER)
# Binding the request socket to a random port
request_socket.bind("tcp://127.0.0.1:0")
main_process_url = request_socket.getsockopt_string(zmq.LAST_ENDPOINT)
ctx = multiprocessing.get_context("fork")
process = ctx.Process(
target=launch_dist_group,
args=(
main_process_url,
model_parallel_size,
init_model_cb,
),
kwargs=kwargs,
)
process.start()
# wait until the model is loaded; rank 0 will send a message to indicate it's ready
request_socket.send_pyobj("READY?")
response = request_socket.recv_pyobj()
print(f"Finished model load {response}")
return request_socket, process
class ModelParallelProcessGroup:
def __init__(
self,
model_parallel_size: int,
init_model_cb: Callable,
**kwargs,
):
self.model_parallel_size = model_parallel_size
self.init_model_cb = init_model_cb
self.started = False
self.running = False
def start(self):
assert not self.started, "process group already started"
self.request_socket, self.process = start_model_parallel_process(
self.model_parallel_size,
self.init_model_cb,
)
self.started = True
def stop(self):
assert self.started, "process group not started"
if self.process.is_alive():
self.request_socket.send_pyobj(_END_SENTINEL, zmq.NOBLOCK)
self.process.join()
self.started = False
def run_inference(self, request) -> Generator:
assert not self.running, "inference already running"
self.running = True
self.request_socket.send_pyobj(request)
try:
while True:
obj = self.request_socket.recv_pyobj()
if obj == _END_SENTINEL:
break
if isinstance(obj, Exception):
print(f"[debug] got exception {obj}")
raise obj
yield obj
except GeneratorExit as e:
self.request_socket.send_pyobj(_CANCEL_SENTINEL)
while True:
obj = self.request_socket.recv_pyobj()
if obj == _END_SENTINEL:
break
finally:
self.running = False
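# Illustrative usage sketch (assumes `init_model` returns a callable such as the
# ModelRunner defined in model_parallel.py above, and `inference_args` is the task
# object that callable accepts):
#
#     group = ModelParallelProcessGroup(model_parallel_size=1, init_model_cb=init_model)
#     group.start()
#     try:
#         for token_result in group.run_inference(inference_args):
#             ...
#     finally:
#         group.stop()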

View file

@ -0,0 +1,84 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from llama_models.llama3.api.datatypes import * # noqa: F403
from llama_stack.inference.api import * # noqa: F403
from llama_models.llama3.prompt_templates import (
BuiltinToolGenerator,
FunctionTagCustomToolGenerator,
JsonCustomToolGenerator,
SystemDefaultGenerator,
)
def prepare_messages(request: ChatCompletionRequest) -> List[Message]:
assert request.tool_choice == ToolChoice.auto, "Only `ToolChoice.auto` supported"
existing_messages = request.messages
existing_system_message = None
if existing_messages[0].role == Role.system.value:
existing_system_message = existing_messages.pop(0)
assert (
existing_messages[0].role != Role.system.value
), "Should only have 1 system message"
messages = []
default_gen = SystemDefaultGenerator()
default_template = default_gen.gen()
sys_content = ""
tool_template = None
if request.tools:
tool_gen = BuiltinToolGenerator()
tool_template = tool_gen.gen(request.tools)
sys_content += tool_template.render()
sys_content += "\n"
sys_content += default_template.render()
if existing_system_message:
# TODO: this fn is needed in many places
def _process(c):
if isinstance(c, str):
return c
else:
return "<media>"
sys_content += "\n"
if isinstance(existing_system_message.content, str):
sys_content += _process(existing_system_message.content)
elif isinstance(existing_system_message.content, list):
sys_content += "\n".join(
[_process(c) for c in existing_system_message.content]
)
messages.append(SystemMessage(content=sys_content))
has_custom_tools = any(isinstance(dfn.tool_name, str) for dfn in request.tools)
if has_custom_tools:
if request.tool_prompt_format == ToolPromptFormat.json:
tool_gen = JsonCustomToolGenerator()
elif request.tool_prompt_format == ToolPromptFormat.function_tag:
tool_gen = FunctionTagCustomToolGenerator()
else:
raise ValueError(
f"Non supported ToolPromptFormat {request.tool_prompt_format}"
)
custom_tools = [t for t in request.tools if isinstance(t.tool_name, str)]
custom_template = tool_gen.gen(custom_tools)
messages.append(UserMessage(content=custom_template.render()))
# Add back existing messages from the request
messages += existing_messages
return messages
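# Illustrative sketch (assumes the wildcard imports above; BuiltinTool comes from
# llama_models.llama3.api.datatypes): with one builtin tool and no system message in
# the request, the returned list starts with a synthesized SystemMessage followed by
# the original user messages.
#
#     request = ChatCompletionRequest(
#         model="Meta-Llama3.1-8B-Instruct",
#         messages=[UserMessage(content="What is 2 + 2?")],
#         tools=[ToolDefinition(tool_name=BuiltinTool.code_interpreter)],
#     )
#     messages = prepare_messages(request)
#     assert messages[0].role == "system"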

View file

@ -0,0 +1,69 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from typing import List
from llama_stack.core.datatypes import * # noqa: F403
def available_providers() -> List[ProviderSpec]:
return [
InlineProviderSpec(
api=Api.inference,
provider_id="meta-reference",
pip_packages=[
"accelerate",
"blobfile",
"codeshield",
"fairscale",
"fbgemm-gpu==0.8.0",
"torch",
"transformers",
"zmq",
],
module="llama_stack.inference.meta_reference",
config_class="llama_stack.inference.meta_reference.MetaReferenceImplConfig",
),
remote_provider_spec(
api=Api.inference,
adapter=AdapterSpec(
adapter_id="ollama",
pip_packages=["ollama"],
module="llama_stack.inference.adapters.ollama",
),
),
remote_provider_spec(
api=Api.inference,
adapter=AdapterSpec(
adapter_id="tgi",
pip_packages=["huggingface_hub"],
module="llama_stack.inference.adapters.tgi",
config_class="llama_stack.inference.adapters.tgi.TGIImplConfig",
),
),
remote_provider_spec(
api=Api.inference,
adapter=AdapterSpec(
adapter_id="fireworks",
pip_packages=[
"fireworks-ai",
],
module="llama_stack.inference.adapters.fireworks",
config_class="llama_stack.inference.adapters.fireworks.FireworksImplConfig",
),
),
remote_provider_spec(
api=Api.inference,
adapter=AdapterSpec(
adapter_id="together",
pip_packages=[
"together",
],
module="llama_stack.inference.adapters.together",
config_class="llama_stack.inference.adapters.together.TogetherImplConfig",
),
),
]

View file

@ -0,0 +1,184 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
# Copyright (c) Meta Platforms, Inc. and affiliates.
# This software may be used and distributed in accordance with the terms of the Llama 3 Community License Agreement.
import collections
from typing import Optional, Type
try:
import fbgemm_gpu.experimental.gen_ai # noqa: F401
print("Using efficient FP8 operators in FBGEMM.")
except ImportError:
print("No efficient FP8 operators. Please install FBGEMM in fp8_requirements.txt.")
raise
import torch
from torch import nn, Tensor
class Fp8ScaledWeights:
# TODO: Ugly trick so torch allows us to replace parameters
# with our custom Fp8Weights instance. Do this properly.
@property
def __class__(self) -> Type[nn.parameter.Parameter]:
return nn.Parameter
@property
def grad_fn(self) -> None:
return None
# pyre-fixme[4]: Attribute annotation cannot be `Any`.
# pyre-fixme[2]: Parameter annotation cannot be `Any`.
class Fp8RowwiseWeights(
Fp8ScaledWeights,
collections.namedtuple(
"Fp8RowwiseWeights",
["weight", "scale", "shape", "activation_scale_ub"],
),
):
pass
def ffn_swiglu(
x: Tensor,
w1: Fp8RowwiseWeights,
w3: Fp8RowwiseWeights,
w2: Fp8RowwiseWeights,
num_tokens: Optional[Tensor] = None,
is_memory_bounded: bool = False,
) -> Tensor:
if (
isinstance(w1, Fp8ScaledWeights)
and isinstance(w3, Fp8ScaledWeights)
and isinstance(w2, Fp8ScaledWeights)
):
return ffn_swiglu_fp8_dynamic(
x, w1, w3, w2, w1.activation_scale_ub, num_tokens, is_memory_bounded
)
(B, T, D) = x.shape # noqa: N806
(HD_L, D_) = w1.shape # noqa: N806
assert D_ == D
assert isinstance(w1, Tensor)
assert isinstance(w3, Tensor)
x1 = x.view(B * T, D) @ w1.T
x2 = x.view(B * T, D) @ w3.T
z = torch.nn.functional.silu(x1) * x2
del x1, x2
assert isinstance(w2, Tensor)
return (z @ w2.T).view(B, T, D)
@torch.inference_mode()
def quantize_fp8(
w: Tensor,
fp8_activation_scale_ub: float,
output_device: Optional[torch.device] = None,
) -> Fp8RowwiseWeights:
"""Quantize [n, k] weight tensor.
Args:
w (Tensor): [n, k] input high precision tensor to quantize.
fp8_activation_scale_ub (float): Upper bound for activation max.
"""
activation_scale_ub = torch.tensor(
[fp8_activation_scale_ub],
dtype=torch.float,
device="cuda",
)
wq, w_scale = torch.ops.fbgemm.quantize_fp8_per_row(w)
del w
return Fp8RowwiseWeights(
weight=wq,
scale=w_scale,
shape=wq.shape,
activation_scale_ub=activation_scale_ub,
)
@torch.inference_mode()
def load_fp8(
w: Tensor,
w_scale: Tensor,
fp8_activation_scale_ub: float,
) -> Fp8RowwiseWeights:
"""Load FP8 [n, k] weight tensor.
Args:
w (Tensor): [n, k] input FP8.
fp8_activation_scale_ub (float): Upper bound for activation max.
"""
activation_scale_ub = torch.tensor(
[fp8_activation_scale_ub],
dtype=torch.float,
device="cuda",
)
return Fp8RowwiseWeights(
weight=w.to(torch.float8_e4m3fn).to(device="cuda"),
scale=w_scale.to(device="cuda"),
shape=w.shape,
activation_scale_ub=activation_scale_ub,
)
def fc_fp8_dynamic(
x: Tensor,
w: Fp8RowwiseWeights,
activation_scale_ub: Optional[Tensor] = None,
num_tokens: Optional[Tensor] = None,
is_memory_bounded: bool = False,
) -> Tensor:
"""
Single w8a8 fc layer with dynamic row-wise scaling.
"""
if isinstance(w, Fp8RowwiseWeights):
xq, x_scale = torch.ops.fbgemm.quantize_fp8_per_row(
x, num_tokens, activation_scale_ub
)
y = torch.ops.fbgemm.f8f8bf16_rowwise(
xq, w.weight, x_scale, w.scale, use_fast_accum=True
)
del xq
return y
def ffn_swiglu_fp8_dynamic(
x: Tensor,
w1: Fp8RowwiseWeights,
w3: Fp8RowwiseWeights,
w2: Fp8RowwiseWeights,
activation_scale_ub: Optional[Tensor] = None,
num_tokens: Optional[Tensor] = None,
is_memory_bounded: bool = False,
) -> Tensor:
(B, T, D) = x.shape # noqa: N806
HD_L = w1.shape[0] # noqa: N806
assert HD_L == w3.shape[0]
x1 = fc_fp8_dynamic(
x.view(B * T, D),
w1,
activation_scale_ub,
num_tokens,
is_memory_bounded,
)
x2 = fc_fp8_dynamic(
x.view(B * T, D),
w3,
activation_scale_ub,
num_tokens,
is_memory_bounded,
)
z = torch.nn.functional.silu(x1) * x2
del x1, x2
z_ = fc_fp8_dynamic(z, w2, activation_scale_ub, num_tokens, is_memory_bounded)
return z_.view(B, T, D)
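# Illustrative sketch (assumes a CUDA device with fbgemm-gpu installed; the shapes are
# arbitrary examples, not part of this diff):
#
#     w = torch.randn(4096, 4096, dtype=torch.bfloat16, device="cuda")
#     wq = quantize_fp8(w, fp8_activation_scale_ub=1200.0)
#     x = torch.randn(8, 4096, dtype=torch.bfloat16, device="cuda")
#     y = fc_fp8_dynamic(x, wq, wq.activation_scale_ub)  # bf16 result of x @ w.T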

View file

@ -0,0 +1,105 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
# Copyright (c) Meta Platforms, Inc. and affiliates.
# This software may be used and distributed in accordance with the terms of the Llama 3 Community License Agreement.
import os
from typing import Optional
import torch
from fairscale.nn.model_parallel.initialize import get_model_parallel_rank
from fairscale.nn.model_parallel.mappings import reduce_from_model_parallel_region
from llama_models.llama3.api.model import Transformer, TransformerBlock
from llama_stack.inference.api import QuantizationType
from llama_stack.inference.api.config import (
CheckpointQuantizationFormat,
MetaReferenceImplConfig,
)
from termcolor import cprint
from torch import Tensor
def is_fbgemm_available() -> bool:
try:
import fbgemm_gpu.experimental.gen_ai # noqa: F401
return True
except ImportError:
return False
def swiglu_wrapper(
self,
x: Tensor,
):
from .fp8_impls import ffn_swiglu
out = ffn_swiglu(x, self.w1.weight, self.w3.weight, self.w2.weight)
return reduce_from_model_parallel_region(out)
def convert_to_quantized_model(
model: Transformer,
config: MetaReferenceImplConfig,
fp8_activation_scale_ub: Optional[float] = 1200.0,
) -> Transformer:
if config.quantization.type == QuantizationType.bf16.value:
return model
elif config.quantization.type != QuantizationType.fp8.value:
raise ValueError("Only FP8 quantization is supported")
from .fp8_impls import Fp8ScaledWeights, load_fp8, quantize_fp8
checkpoint = config.checkpoint_config.checkpoint
# Move weights to GPU with quantization
if checkpoint.quantization_format == CheckpointQuantizationFormat.fp8_mixed.value:
cprint("Loading fp8 scales...", "yellow")
fp8_scales_path = os.path.join(
checkpoint.checkpoint_dir, f"fp8_scales_{get_model_parallel_rank()}.pt"
)
assert os.path.isfile(
fp8_scales_path
), f"fp8_scales_path not found for rank {get_model_parallel_rank()}"
fp8_scales = torch.load(fp8_scales_path, weights_only=True)
for block in model.layers:
if isinstance(block, TransformerBlock):
if block.layer_id == 0 or block.layer_id == (model.n_layers - 1):
continue
block.feed_forward.forward = swiglu_wrapper.__get__(block.feed_forward)
for key in ("w1", "w3", "w2"):
param = getattr(block.feed_forward, key)
param.weight = load_fp8(
param.weight,
fp8_scales[
f"{block.layer_id}_feed_forward.{key}_{get_model_parallel_rank()}"
],
fp8_activation_scale_ub,
)
else:
cprint("Quantizing fp8 weights from bf16...", "yellow")
for block in model.layers:
if isinstance(block, TransformerBlock):
if block.layer_id == 0 or block.layer_id == (model.n_layers - 1):
continue
block.feed_forward.forward = swiglu_wrapper.__get__(block.feed_forward)
for key in ("w1", "w3", "w2"):
param = getattr(block.feed_forward, key)
param.weight = quantize_fp8(
param.weight,
fp8_activation_scale_ub,
output_device=torch.device("cuda"),
)
for _, parameter in model.named_parameters():
if not isinstance(parameter, Fp8ScaledWeights):
parameter.data = parameter.to(device="cuda")
return model

View file

@ -0,0 +1,30 @@
#!/bin/bash
if [[ $# -ne 1 ]]; then
echo "Error: Please provide the name of CONDA environment you wish to create"
exit 1
fi
ENV_NAME=$1
set -eu
eval "$(conda shell.bash hook)"
echo "Will build env (or overwrite) named '$ENV_NAME'"
set -x
run_build() {
# Set up the conda environment
yes | conda remove --name $ENV_NAME --all
yes | conda create -n $ENV_NAME python=3.10
conda activate $ENV_NAME
# PT nightly
pip install --pre torch --index-url https://download.pytorch.org/whl/nightly/cu121
# install dependencies for `llama-agentic-system`
pip install -r fp8_requirements.txt
}
run_build

View file

@ -0,0 +1,161 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
# Copyright (c) Meta Platforms, Inc. and affiliates.
# This software may be used and distributed in accordance with the terms of the Llama 3 Community License Agreement.
import json
import os
import shutil
import sys
from pathlib import Path
from typing import Optional
import fire
import torch
from fairscale.nn.model_parallel.initialize import (
get_model_parallel_rank,
initialize_model_parallel,
model_parallel_is_initialized,
)
from fp8.fp8_impls import FfnQuantizeMode, quantize_fp8
from llama.model import ModelArgs, Transformer, TransformerBlock
from llama.tokenizer import Tokenizer
from torch.nn.parameter import Parameter
def main(
ckpt_dir: str,
tokenizer_path: str,
quantized_ckpt_dir: str,
max_seq_len: Optional[int] = 512,
max_batch_size: Optional[int] = 4,
model_parallel_size: Optional[int] = None,
ffn_quantize_mode: Optional[FfnQuantizeMode] = FfnQuantizeMode.FP8_ROWWISE,
fp8_activation_scale_ub: Optional[float] = 1200.0,
seed: int = 1,
):
""" """
if not os.path.exists(quantized_ckpt_dir):
os.makedirs(quantized_ckpt_dir)
shutil.copy(
os.path.join(ckpt_dir, "params.json"),
os.path.join(quantized_ckpt_dir, "params.json"),
)
shutil.copy(
os.path.join(ckpt_dir, "tokenizer.model"),
os.path.join(quantized_ckpt_dir, "tokenizer.model"),
)
if not torch.distributed.is_initialized():
torch.distributed.init_process_group("nccl")
if not model_parallel_is_initialized():
if model_parallel_size is None:
model_parallel_size = int(os.environ.get("WORLD_SIZE", 1))
initialize_model_parallel(model_parallel_size)
local_rank = int(os.environ.get("LOCAL_RANK", 0))
torch.cuda.set_device(local_rank)
# seed must be the same in all processes
torch.manual_seed(seed)
if local_rank > 0:
sys.stdout = open(os.devnull, "w")
checkpoints = sorted(Path(ckpt_dir).glob("*.pth"))
assert len(checkpoints) > 0, f"no checkpoint files found in {ckpt_dir}"
assert model_parallel_size == len(
checkpoints
), f"Loading a checkpoint for MP={len(checkpoints)} but world size is {model_parallel_size}"
ckpt_path = checkpoints[get_model_parallel_rank()]
checkpoint = torch.load(ckpt_path, map_location="cpu", weights_only=True)
with open(Path(ckpt_dir) / "params.json", "r") as f:
params = json.loads(f.read())
model_args: ModelArgs = ModelArgs(
max_seq_len=max_seq_len,
max_batch_size=max_batch_size,
**params,
)
tokenizer = Tokenizer(model_path=tokenizer_path)
assert (
model_args.vocab_size == tokenizer.n_words
), f"model_args vocab = {model_args.vocab_size} but tokenizer vocab = {tokenizer.n_words}"
# load on CPU in bf16 so that fp8 conversion does not find an unexpected (fp32, e.g.) datatype
torch.set_default_tensor_type(torch.BFloat16Tensor)
model = Transformer(model_args)
model.load_state_dict(checkpoint, strict=False)
if torch.cuda.is_bf16_supported():
torch.set_default_tensor_type(torch.cuda.BFloat16Tensor)
else:
torch.set_default_tensor_type(torch.cuda.HalfTensor)
print(ckpt_path)
assert (
quantized_ckpt_dir is not None
), "QUantized checkpoint directory should not be None"
fp8_scales = {}
for block in model.layers:
if isinstance(block, TransformerBlock):
if block.layer_id == 0 or block.layer_id == (model.n_layers - 1):
continue
fp8_weight = quantize_fp8(
block.feed_forward.w1.weight,
fp8_activation_scale_ub,
ffn_quantize_mode,
output_device=torch.device("cpu"),
)
with torch.inference_mode():
block.feed_forward.w1.weight = Parameter(fp8_weight.weight)
fp8_scales[
f"{block.layer_id}_feed_forward.w1_{get_model_parallel_rank()}"
] = fp8_weight.scale
fp8_weight = quantize_fp8(
block.feed_forward.w3.weight,
fp8_activation_scale_ub,
ffn_quantize_mode,
output_device=torch.device("cpu"),
)
with torch.inference_mode():
block.feed_forward.w3.weight = Parameter(fp8_weight.weight)
fp8_scales[
f"{block.layer_id}_feed_forward.w3_{get_model_parallel_rank()}"
] = fp8_weight.scale
fp8_weight = quantize_fp8(
block.feed_forward.w2.weight,
fp8_activation_scale_ub,
ffn_quantize_mode,
output_device=torch.device("cpu"),
)
with torch.inference_mode():
block.feed_forward.w2.weight = Parameter(fp8_weight.weight)
fp8_scales[
f"{block.layer_id}_feed_forward.w2_{get_model_parallel_rank()}"
] = fp8_weight.scale
fp8_scales_path = os.path.join(
quantized_ckpt_dir, f"fp8_scales_{get_model_parallel_rank()}.pt"
)
torch.save(fp8_scales, fp8_scales_path)
ckpt_path = os.path.join(
quantized_ckpt_dir,
"consolidated.{:02d}.pth".format(get_model_parallel_rank()),
)
torch.save(model.state_dict(), ckpt_path)
if __name__ == "__main__":
fire.Fire(main)
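# Per-rank outputs written above (matching what the fp8 loader earlier in this
# diff expects):
#   quantized_ckpt_dir/consolidated.{rank:02d}.pth  - state dict with fp8 FFN weights
#   quantized_ckpt_dir/fp8_scales_{rank}.pt         - dict mapping
#       "{layer_id}_feed_forward.{w1|w2|w3}_{rank}" to the scale tensor for that weight
# A rough read-back sketch (illustrative only):
#   scales = torch.load(fp8_scales_path, weights_only=True)
#   w1_scale = scales[f"{layer_id}_feed_forward.w1_{get_model_parallel_rank()}"]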

View file

@ -0,0 +1,31 @@
#!/bin/bash
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
set -euo pipefail
set -x
cd $(git rev-parse --show-toplevel)
MASTER_HOST=$1
RUN_ID=$2
CKPT_DIR=$3
QUANT_CKPT_DIR=$4
TOKENIZER_PATH=$5
NNODES=$6
NPROC=$7
echo $MASTER_HOST, $RUN_ID, $CKPT_DIR, $QUANT_CKPT_DIR
NCCL_NET=Socket NCCL_SOCKET_IFNAME=eth TIKTOKEN_CACHE_DIR="" \
torchrun \
--nnodes=$NNODES --nproc_per_node=$NPROC \
--rdzv_id=$RUN_ID \
--rdzv_conf='timeout=120' \
--rdzv_backend=c10d \
--rdzv_endpoint="${MASTER_HOST}:29502" \
quantize_checkpoint.py $CKPT_DIR $TOKENIZER_PATH $QUANT_CKPT_DIR
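# The positional args map onto quantize_checkpoint.py's main() via fire.Fire:
# ckpt_dir, tokenizer_path, quantized_ckpt_dir. The remaining options
# (max_seq_len, model_parallel_size, ffn_quantize_mode, ...) keep their defaults.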

View file

@ -0,0 +1,76 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
# Copyright (c) Meta Platforms, Inc. and affiliates.
# This software may be used and distributed in accordance with the terms of the Llama 3 Community License Agreement.
import unittest
import torch
from fp8_impls import ffn_swiglu_fp8_dynamic, FfnQuantizeMode, quantize_fp8
from hypothesis import given, settings, strategies as st
from torch import Tensor
@unittest.skipIf(
not torch.cuda.is_available()
or torch.cuda.get_device_properties(torch.cuda.current_device()).major < 9,
"Skip when H100 is not available",
)
class FP8Tests(unittest.TestCase):
@settings(deadline=None)
@given(
D=st.sampled_from([4096, 8192]),
HD_L=st.sampled_from([1280, 2560]),
B=st.sampled_from([1, 2]),
T=st.sampled_from([2048, 4096]),
UB=st.sampled_from([1000, 10000]),
)
def test_fp8_ffn(
self,
D: int, # noqa
HD_L: int,
B: int,
T: int,
UB: float,
) -> None:
x = torch.randn(size=(B, T, D), dtype=torch.bfloat16, device="cuda") * 0.1
w1 = torch.randn(size=(HD_L, D), dtype=torch.bfloat16, device="cuda") * 0.01
w3 = torch.randn(size=(HD_L, D), dtype=torch.bfloat16, device="cuda") * 0.01
w2 = torch.randn(size=(D, HD_L), dtype=torch.bfloat16, device="cuda") * 0.1
x_q = quantize_fp8(x, UB, mode=FfnQuantizeMode.FP8_ROWWISE)
w1_q = quantize_fp8(w1, UB, mode=FfnQuantizeMode.FP8_ROWWISE)
w3_q = quantize_fp8(w3, UB, mode=FfnQuantizeMode.FP8_ROWWISE)
w2_q = quantize_fp8(w2, UB, mode=FfnQuantizeMode.FP8_ROWWISE)
def ref_ffn(x: Tensor, w1: Tensor, w3: Tensor, w2: Tensor) -> Tensor:
(B, T, D) = x.shape # noqa: N806
(HD_L, D_) = w1.shape # noqa: N806
assert D_ == D
x1 = x.view(B * T, D) @ w1.T
x2 = x.view(B * T, D) @ w3.T
z = torch.nn.functional.silu(x1) * x2
return (z @ w2.T).view(B, T, D).to(torch.bfloat16)
v = ffn_swiglu_fp8_dynamic(x, w1_q, w3_q, w2_q)
# Fake quant
x = x_q.weight.bfloat16() * x_q.scale.unsqueeze(-1)
w1 = w1_q.weight.bfloat16() * w1_q.scale.unsqueeze(-1)
w3 = w3_q.weight.bfloat16() * w3_q.scale.unsqueeze(-1)
w2 = w2_q.weight.bfloat16() * w2_q.scale.unsqueeze(-1)
v_ref = ref_ffn(x, w1, w3, w2)
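# Tolerances are deliberately loose: the fused fp8 path and the bf16 fake-quant reference differ by fp8 rounding error.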
torch.testing.assert_close(v_ref, v, atol=4.0e-3, rtol=4.0e-3)
if __name__ == "__main__":
unittest.main()

Some files were not shown because too many files have changed in this diff