Merge branch 'main' into opengauss-add

2025-12-23 01:02:25 +00:00 · 2025-08-08 20:58:48 +08:00 · 2025-08-08 20:58:48 +08:00 · 39e49ab97a
commit 39e49ab97a
parent 5e9c394500 9e78f2da96
807 changed files with 79555 additions and 26772 deletions
--- a/llama_stack/__init__.py
+++ b/llama_stack/__init__.py
@ -4,7 +4,7 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.

-from llama_stack.distribution.library_client import (  # noqa: F401
+from llama_stack.core.library_client import (  # noqa: F401
    AsyncLlamaStackAsLibraryClient,
    LlamaStackAsLibraryClient,
 )
--- a/llama_stack/apis/agents/agents.py
+++ b/llama_stack/apis/agents/agents.py
@ -152,7 +152,17 @@ Step = Annotated[

@json_schema_type
 class Turn(BaseModel):
-    """A single turn in an interaction with an Agentic System."""
+    """A single turn in an interaction with an Agentic System.
+
+    :param turn_id: Unique identifier for the turn within a session
+    :param session_id: Unique identifier for the conversation session
+    :param input_messages: List of messages that initiated this turn
+    :param steps: Ordered list of processing steps executed during this turn
+    :param output_message: The model's generated response containing content and metadata
+    :param output_attachments: (Optional) Files or media attached to the agent's response
+    :param started_at: Timestamp when the turn began
+    :param completed_at: (Optional) Timestamp when the turn finished, if completed
+    """

    turn_id: str
    session_id: str
@ -167,7 +177,13 @@ class Turn(BaseModel):

@json_schema_type
 class Session(BaseModel):
-    """A single session of an interaction with an Agentic System."""
+    """A single session of an interaction with an Agentic System.
+
+    :param session_id: Unique identifier for the conversation session
+    :param session_name: Human-readable name for the session
+    :param turns: List of all turns that have occurred in this session
+    :param started_at: Timestamp when the session was created
+    """

    session_id: str
    session_name: str
@ -232,6 +248,13 @@ class AgentConfig(AgentConfigCommon):

@json_schema_type
 class Agent(BaseModel):
+    """An agent instance with configuration and metadata.
+
+    :param agent_id: Unique identifier for the agent
+    :param agent_config: Configuration settings for the agent
+    :param created_at: Timestamp when the agent was created
+    """
+
    agent_id: str
    agent_config: AgentConfig
    created_at: datetime
@ -253,6 +276,14 @@ class AgentTurnResponseEventType(StrEnum):

@json_schema_type
 class AgentTurnResponseStepStartPayload(BaseModel):
+    """Payload for step start events in agent turn responses.
+
+    :param event_type: Type of event being reported
+    :param step_type: Type of step being executed
+    :param step_id: Unique identifier for the step within a turn
+    :param metadata: (Optional) Additional metadata for the step
+    """
+
    event_type: Literal[AgentTurnResponseEventType.step_start] = AgentTurnResponseEventType.step_start
    step_type: StepType
    step_id: str
@ -261,6 +292,14 @@ class AgentTurnResponseStepStartPayload(BaseModel):

@json_schema_type
 class AgentTurnResponseStepCompletePayload(BaseModel):
+    """Payload for step completion events in agent turn responses.
+
+    :param event_type: Type of event being reported
+    :param step_type: Type of step being executed
+    :param step_id: Unique identifier for the step within a turn
+    :param step_details: Complete details of the executed step
+    """
+
    event_type: Literal[AgentTurnResponseEventType.step_complete] = AgentTurnResponseEventType.step_complete
    step_type: StepType
    step_id: str
@ -269,6 +308,14 @@ class AgentTurnResponseStepCompletePayload(BaseModel):

@json_schema_type
 class AgentTurnResponseStepProgressPayload(BaseModel):
+    """Payload for step progress events in agent turn responses.
+
+    :param event_type: Type of event being reported
+    :param step_type: Type of step being executed
+    :param step_id: Unique identifier for the step within a turn
+    :param delta: Incremental content changes during step execution
+    """
+
    model_config = ConfigDict(protected_namespaces=())

    event_type: Literal[AgentTurnResponseEventType.step_progress] = AgentTurnResponseEventType.step_progress
@ -280,18 +327,36 @@ class AgentTurnResponseStepProgressPayload(BaseModel):

@json_schema_type
 class AgentTurnResponseTurnStartPayload(BaseModel):
+    """Payload for turn start events in agent turn responses.
+
+    :param event_type: Type of event being reported
+    :param turn_id: Unique identifier for the turn within a session
+    """
+
    event_type: Literal[AgentTurnResponseEventType.turn_start] = AgentTurnResponseEventType.turn_start
    turn_id: str


@json_schema_type
 class AgentTurnResponseTurnCompletePayload(BaseModel):
+    """Payload for turn completion events in agent turn responses.
+
+    :param event_type: Type of event being reported
+    :param turn: Complete turn data including all steps and results
+    """
+
    event_type: Literal[AgentTurnResponseEventType.turn_complete] = AgentTurnResponseEventType.turn_complete
    turn: Turn


@json_schema_type
 class AgentTurnResponseTurnAwaitingInputPayload(BaseModel):
+    """Payload for turn awaiting input events in agent turn responses.
+
+    :param event_type: Type of event being reported
+    :param turn: Turn data when waiting for external tool responses
+    """
+
    event_type: Literal[AgentTurnResponseEventType.turn_awaiting_input] = AgentTurnResponseEventType.turn_awaiting_input
    turn: Turn

@ -310,21 +375,47 @@ register_schema(AgentTurnResponseEventPayload, name="AgentTurnResponseEventPaylo

@json_schema_type
 class AgentTurnResponseEvent(BaseModel):
+    """An event in an agent turn response stream.
+
+    :param payload: Event-specific payload containing event data
+    """
+
    payload: AgentTurnResponseEventPayload


@json_schema_type
 class AgentCreateResponse(BaseModel):
+    """Response returned when creating a new agent.
+
+    :param agent_id: Unique identifier for the created agent
+    """
+
    agent_id: str


@json_schema_type
 class AgentSessionCreateResponse(BaseModel):
+    """Response returned when creating a new agent session.
+
+    :param session_id: Unique identifier for the created session
+    """
+
    session_id: str


@json_schema_type
 class AgentTurnCreateRequest(AgentConfigOverridablePerTurn):
+    """Request to create a new turn for an agent.
+
+    :param agent_id: Unique identifier for the agent
+    :param session_id: Unique identifier for the conversation session
+    :param messages: List of messages to start the turn with
+    :param documents: (Optional) List of documents to provide to the agent
+    :param toolgroups: (Optional) List of tool groups to make available for this turn
+    :param stream: (Optional) Whether to stream the response
+    :param tool_config: (Optional) Tool configuration to override agent defaults
+    """
+
    agent_id: str
    session_id: str

@ -342,6 +433,15 @@ class AgentTurnCreateRequest(AgentConfigOverridablePerTurn):

@json_schema_type
 class AgentTurnResumeRequest(BaseModel):
+    """Request to resume an agent turn with tool responses.
+
+    :param agent_id: Unique identifier for the agent
+    :param session_id: Unique identifier for the conversation session
+    :param turn_id: Unique identifier for the turn within a session
+    :param tool_responses: List of tool responses to submit to continue the turn
+    :param stream: (Optional) Whether to stream the response
+    """
+
    agent_id: str
    session_id: str
    turn_id: str
@ -351,13 +451,21 @@ class AgentTurnResumeRequest(BaseModel):

@json_schema_type
 class AgentTurnResponseStreamChunk(BaseModel):
-    """streamed agent turn completion response."""
+    """Streamed agent turn completion response.
+
+    :param event: Individual event in the agent turn response stream
+    """

    event: AgentTurnResponseEvent


@json_schema_type
 class AgentStepResponse(BaseModel):
+    """Response containing details of a specific agent step.
+
+    :param step: The complete step data and execution details
+    """
+
    step: Step


--- a/llama_stack/apis/agents/openai_responses.py
+++ b/llama_stack/apis/agents/openai_responses.py
@ -18,18 +18,37 @@ from llama_stack.schema_utils import json_schema_type, register_schema

@json_schema_type
 class OpenAIResponseError(BaseModel):
+    """Error details for failed OpenAI response requests.
+
+    :param code: Error code identifying the type of failure
+    :param message: Human-readable error message describing the failure
+    """
+
    code: str
    message: str


@json_schema_type
 class OpenAIResponseInputMessageContentText(BaseModel):
+    """Text content for input messages in OpenAI response format.
+
+    :param text: The text content of the input message
+    :param type: Content type identifier, always "input_text"
+    """
+
    text: str
    type: Literal["input_text"] = "input_text"


@json_schema_type
 class OpenAIResponseInputMessageContentImage(BaseModel):
+    """Image content for input messages in OpenAI response format.
+
+    :param detail: Level of detail for image processing, can be "low", "high", or "auto"
+    :param type: Content type identifier, always "input_image"
+    :param image_url: (Optional) URL of the image content
+    """
+
    detail: Literal["low"] | Literal["high"] | Literal["auto"] = "auto"
    type: Literal["input_image"] = "input_image"
    # TODO: handle file_id
@ -46,6 +65,14 @@ register_schema(OpenAIResponseInputMessageContent, name="OpenAIResponseInputMess

@json_schema_type
 class OpenAIResponseAnnotationFileCitation(BaseModel):
+    """File citation annotation for referencing specific files in response content.
+
+    :param type: Annotation type identifier, always "file_citation"
+    :param file_id: Unique identifier of the referenced file
+    :param filename: Name of the referenced file
+    :param index: Position index of the citation within the content
+    """
+
    type: Literal["file_citation"] = "file_citation"
    file_id: str
    filename: str
@ -54,6 +81,15 @@ class OpenAIResponseAnnotationFileCitation(BaseModel):

@json_schema_type
 class OpenAIResponseAnnotationCitation(BaseModel):
+    """URL citation annotation for referencing external web resources.
+
+    :param type: Annotation type identifier, always "url_citation"
+    :param end_index: End position of the citation span in the content
+    :param start_index: Start position of the citation span in the content
+    :param title: Title of the referenced web resource
+    :param url: URL of the referenced web resource
+    """
+
    type: Literal["url_citation"] = "url_citation"
    end_index: int
    start_index: int
@ -122,6 +158,13 @@ class OpenAIResponseMessage(BaseModel):

@json_schema_type
 class OpenAIResponseOutputMessageWebSearchToolCall(BaseModel):
+    """Web search tool call output message for OpenAI responses.
+
+    :param id: Unique identifier for this tool call
+    :param status: Current status of the web search operation
+    :param type: Tool call type identifier, always "web_search_call"
+    """
+
    id: str
    status: str
    type: Literal["web_search_call"] = "web_search_call"
@ -129,6 +172,15 @@ class OpenAIResponseOutputMessageWebSearchToolCall(BaseModel):

@json_schema_type
 class OpenAIResponseOutputMessageFileSearchToolCall(BaseModel):
+    """File search tool call output message for OpenAI responses.
+
+    :param id: Unique identifier for this tool call
+    :param queries: List of search queries executed
+    :param status: Current status of the file search operation
+    :param type: Tool call type identifier, always "file_search_call"
+    :param results: (Optional) Search results returned by the file search operation
+    """
+
    id: str
    queries: list[str]
    status: str
@ -138,6 +190,16 @@ class OpenAIResponseOutputMessageFileSearchToolCall(BaseModel):

@json_schema_type
 class OpenAIResponseOutputMessageFunctionToolCall(BaseModel):
+    """Function tool call output message for OpenAI responses.
+
+    :param call_id: Unique identifier for the function call
+    :param name: Name of the function being called
+    :param arguments: JSON string containing the function arguments
+    :param type: Tool call type identifier, always "function_call"
+    :param id: (Optional) Additional identifier for the tool call
+    :param status: (Optional) Current status of the function call execution
+    """
+
    call_id: str
    name: str
    arguments: str
@ -148,6 +210,17 @@ class OpenAIResponseOutputMessageFunctionToolCall(BaseModel):

@json_schema_type
 class OpenAIResponseOutputMessageMCPCall(BaseModel):
+    """Model Context Protocol (MCP) call output message for OpenAI responses.
+
+    :param id: Unique identifier for this MCP call
+    :param type: Tool call type identifier, always "mcp_call"
+    :param arguments: JSON string containing the MCP call arguments
+    :param name: Name of the MCP method being called
+    :param server_label: Label identifying the MCP server handling the call
+    :param error: (Optional) Error message if the MCP call failed
+    :param output: (Optional) Output result from the successful MCP call
+    """
+
    id: str
    type: Literal["mcp_call"] = "mcp_call"
    arguments: str
@ -158,6 +231,13 @@ class OpenAIResponseOutputMessageMCPCall(BaseModel):


 class MCPListToolsTool(BaseModel):
+    """Tool definition returned by MCP list tools operation.
+
+    :param input_schema: JSON schema defining the tool's input parameters
+    :param name: Name of the tool
+    :param description: (Optional) Description of what the tool does
+    """
+
    input_schema: dict[str, Any]
    name: str
    description: str | None = None
@ -165,6 +245,14 @@ class MCPListToolsTool(BaseModel):

@json_schema_type
 class OpenAIResponseOutputMessageMCPListTools(BaseModel):
+    """MCP list tools output message containing available tools from an MCP server.
+
+    :param id: Unique identifier for this MCP list tools operation
+    :param type: Tool call type identifier, always "mcp_list_tools"
+    :param server_label: Label identifying the MCP server providing the tools
+    :param tools: List of available tools provided by the MCP server
+    """
+
    id: str
    type: Literal["mcp_list_tools"] = "mcp_list_tools"
    server_label: str
@ -206,11 +294,34 @@ class OpenAIResponseTextFormat(TypedDict, total=False):

@json_schema_type
 class OpenAIResponseText(BaseModel):
+    """Text response configuration for OpenAI responses.
+
+    :param format: (Optional) Text format configuration specifying output format requirements
+    """
+
    format: OpenAIResponseTextFormat | None = None


@json_schema_type
 class OpenAIResponseObject(BaseModel):
+    """Complete OpenAI response object containing generation results and metadata.
+
+    :param created_at: Unix timestamp when the response was created
+    :param error: (Optional) Error details if the response generation failed
+    :param id: Unique identifier for this response
+    :param model: Model identifier used for generation
+    :param object: Object type identifier, always "response"
+    :param output: List of generated output items (messages, tool calls, etc.)
+    :param parallel_tool_calls: Whether tool calls can be executed in parallel
+    :param previous_response_id: (Optional) ID of the previous response in a conversation
+    :param status: Current status of the response generation
+    :param temperature: (Optional) Sampling temperature used for generation
+    :param text: Text formatting configuration for the response
+    :param top_p: (Optional) Nucleus sampling parameter used for generation
+    :param truncation: (Optional) Truncation strategy applied to the response
+    :param user: (Optional) User identifier associated with the request
+    """
+
    created_at: int
    error: OpenAIResponseError | None = None
    id: str
@ -231,6 +342,13 @@ class OpenAIResponseObject(BaseModel):

@json_schema_type
 class OpenAIDeleteResponseObject(BaseModel):
+    """Response object confirming deletion of an OpenAI response.
+
+    :param id: Unique identifier of the deleted response
+    :param object: Object type identifier, always "response"
+    :param deleted: Deletion confirmation flag, defaults to True
+    """
+
    id: str
    object: Literal["response"] = "response"
    deleted: bool = True
@ -238,18 +356,39 @@ class OpenAIDeleteResponseObject(BaseModel):

@json_schema_type
 class OpenAIResponseObjectStreamResponseCreated(BaseModel):
+    """Streaming event indicating a new response has been created.
+
+    :param response: The newly created response object
+    :param type: Event type identifier, always "response.created"
+    """
+
    response: OpenAIResponseObject
    type: Literal["response.created"] = "response.created"


@json_schema_type
 class OpenAIResponseObjectStreamResponseCompleted(BaseModel):
+    """Streaming event indicating a response has been completed.
+
+    :param response: The completed response object
+    :param type: Event type identifier, always "response.completed"
+    """
+
    response: OpenAIResponseObject
    type: Literal["response.completed"] = "response.completed"


@json_schema_type
 class OpenAIResponseObjectStreamResponseOutputItemAdded(BaseModel):
+    """Streaming event for when a new output item is added to the response.
+
+    :param response_id: Unique identifier of the response containing this output
+    :param item: The output item that was added (message, tool call, etc.)
+    :param output_index: Index position of this item in the output list
+    :param sequence_number: Sequential number for ordering streaming events
+    :param type: Event type identifier, always "response.output_item.added"
+    """
+
    response_id: str
    item: OpenAIResponseOutput
    output_index: int
@ -259,6 +398,15 @@ class OpenAIResponseObjectStreamResponseOutputItemAdded(BaseModel):

@json_schema_type
 class OpenAIResponseObjectStreamResponseOutputItemDone(BaseModel):
+    """Streaming event for when an output item is completed.
+
+    :param response_id: Unique identifier of the response containing this output
+    :param item: The completed output item (message, tool call, etc.)
+    :param output_index: Index position of this item in the output list
+    :param sequence_number: Sequential number for ordering streaming events
+    :param type: Event type identifier, always "response.output_item.done"
+    """
+
    response_id: str
    item: OpenAIResponseOutput
    output_index: int
@ -268,6 +416,16 @@ class OpenAIResponseObjectStreamResponseOutputItemDone(BaseModel):

@json_schema_type
 class OpenAIResponseObjectStreamResponseOutputTextDelta(BaseModel):
+    """Streaming event for incremental text content updates.
+
+    :param content_index: Index position within the text content
+    :param delta: Incremental text content being added
+    :param item_id: Unique identifier of the output item being updated
+    :param output_index: Index position of the item in the output list
+    :param sequence_number: Sequential number for ordering streaming events
+    :param type: Event type identifier, always "response.output_text.delta"
+    """
+
    content_index: int
    delta: str
    item_id: str
@ -278,6 +436,16 @@ class OpenAIResponseObjectStreamResponseOutputTextDelta(BaseModel):

@json_schema_type
 class OpenAIResponseObjectStreamResponseOutputTextDone(BaseModel):
+    """Streaming event for when text output is completed.
+
+    :param content_index: Index position within the text content
+    :param text: Final complete text content of the output item
+    :param item_id: Unique identifier of the completed output item
+    :param output_index: Index position of the item in the output list
+    :param sequence_number: Sequential number for ordering streaming events
+    :param type: Event type identifier, always "response.output_text.done"
+    """
+
    content_index: int
    text: str  # final text of the output item
    item_id: str
@ -288,6 +456,15 @@ class OpenAIResponseObjectStreamResponseOutputTextDone(BaseModel):

@json_schema_type
 class OpenAIResponseObjectStreamResponseFunctionCallArgumentsDelta(BaseModel):
+    """Streaming event for incremental function call argument updates.
+
+    :param delta: Incremental function call arguments being added
+    :param item_id: Unique identifier of the function call being updated
+    :param output_index: Index position of the item in the output list
+    :param sequence_number: Sequential number for ordering streaming events
+    :param type: Event type identifier, always "response.function_call_arguments.delta"
+    """
+
    delta: str
    item_id: str
    output_index: int
@ -297,6 +474,15 @@ class OpenAIResponseObjectStreamResponseFunctionCallArgumentsDelta(BaseModel):

@json_schema_type
 class OpenAIResponseObjectStreamResponseFunctionCallArgumentsDone(BaseModel):
+    """Streaming event for when function call arguments are completed.
+
+    :param arguments: Final complete arguments JSON string for the function call
+    :param item_id: Unique identifier of the completed function call
+    :param output_index: Index position of the item in the output list
+    :param sequence_number: Sequential number for ordering streaming events
+    :param type: Event type identifier, always "response.function_call_arguments.done"
+    """
+
    arguments: str  # final arguments of the function call
    item_id: str
    output_index: int
@ -306,6 +492,14 @@ class OpenAIResponseObjectStreamResponseFunctionCallArgumentsDone(BaseModel):

@json_schema_type
 class OpenAIResponseObjectStreamResponseWebSearchCallInProgress(BaseModel):
+    """Streaming event for web search calls in progress.
+
+    :param item_id: Unique identifier of the web search call
+    :param output_index: Index position of the item in the output list
+    :param sequence_number: Sequential number for ordering streaming events
+    :param type: Event type identifier, always "response.web_search_call.in_progress"
+    """
+
    item_id: str
    output_index: int
    sequence_number: int
@ -322,6 +516,14 @@ class OpenAIResponseObjectStreamResponseWebSearchCallSearching(BaseModel):

@json_schema_type
 class OpenAIResponseObjectStreamResponseWebSearchCallCompleted(BaseModel):
+    """Streaming event for completed web search calls.
+
+    :param item_id: Unique identifier of the completed web search call
+    :param output_index: Index position of the item in the output list
+    :param sequence_number: Sequential number for ordering streaming events
+    :param type: Event type identifier, always "response.web_search_call.completed"
+    """
+
    item_id: str
    output_index: int
    sequence_number: int
@ -366,6 +568,14 @@ class OpenAIResponseObjectStreamResponseMcpCallArgumentsDone(BaseModel):

@json_schema_type
 class OpenAIResponseObjectStreamResponseMcpCallInProgress(BaseModel):
+    """Streaming event for MCP calls in progress.
+
+    :param item_id: Unique identifier of the MCP call
+    :param output_index: Index position of the item in the output list
+    :param sequence_number: Sequential number for ordering streaming events
+    :param type: Event type identifier, always "response.mcp_call.in_progress"
+    """
+
    item_id: str
    output_index: int
    sequence_number: int
@ -374,12 +584,24 @@ class OpenAIResponseObjectStreamResponseMcpCallInProgress(BaseModel):

@json_schema_type
 class OpenAIResponseObjectStreamResponseMcpCallFailed(BaseModel):
+    """Streaming event for failed MCP calls.
+
+    :param sequence_number: Sequential number for ordering streaming events
+    :param type: Event type identifier, always "response.mcp_call.failed"
+    """
+
    sequence_number: int
    type: Literal["response.mcp_call.failed"] = "response.mcp_call.failed"


@json_schema_type
 class OpenAIResponseObjectStreamResponseMcpCallCompleted(BaseModel):
+    """Streaming event for completed MCP calls.
+
+    :param sequence_number: Sequential number for ordering streaming events
+    :param type: Event type identifier, always "response.mcp_call.completed"
+    """
+
    sequence_number: int
    type: Literal["response.mcp_call.completed"] = "response.mcp_call.completed"

@ -442,6 +664,12 @@ WebSearchToolTypes = ["web_search", "web_search_preview", "web_search_preview_20

@json_schema_type
 class OpenAIResponseInputToolWebSearch(BaseModel):
+    """Web search tool configuration for OpenAI response inputs.
+
+    :param type: Web search tool type variant to use
+    :param search_context_size: (Optional) Size of search context, must be "low", "medium", or "high"
+    """
+
    # Must match values of WebSearchToolTypes above
    type: Literal["web_search"] | Literal["web_search_preview"] | Literal["web_search_preview_2025_03_11"] = (
        "web_search"
@ -453,6 +681,15 @@ class OpenAIResponseInputToolWebSearch(BaseModel):

@json_schema_type
 class OpenAIResponseInputToolFunction(BaseModel):
+    """Function tool configuration for OpenAI response inputs.
+
+    :param type: Tool type identifier, always "function"
+    :param name: Name of the function that can be called
+    :param description: (Optional) Description of what the function does
+    :param parameters: (Optional) JSON schema defining the function's parameters
+    :param strict: (Optional) Whether to enforce strict parameter validation
+    """
+
    type: Literal["function"] = "function"
    name: str
    description: str | None = None
@ -462,6 +699,15 @@ class OpenAIResponseInputToolFunction(BaseModel):

@json_schema_type
 class OpenAIResponseInputToolFileSearch(BaseModel):
+    """File search tool configuration for OpenAI response inputs.
+
+    :param type: Tool type identifier, always "file_search"
+    :param vector_store_ids: List of vector store identifiers to search within
+    :param filters: (Optional) Additional filters to apply to the search
+    :param max_num_results: (Optional) Maximum number of search results to return (1-50)
+    :param ranking_options: (Optional) Options for ranking and scoring search results
+    """
+
    type: Literal["file_search"] = "file_search"
    vector_store_ids: list[str]
    filters: dict[str, Any] | None = None
@ -470,16 +716,37 @@ class OpenAIResponseInputToolFileSearch(BaseModel):


 class ApprovalFilter(BaseModel):
+    """Filter configuration for MCP tool approval requirements.
+
+    :param always: (Optional) List of tool names that always require approval
+    :param never: (Optional) List of tool names that never require approval
+    """
+
    always: list[str] | None = None
    never: list[str] | None = None


 class AllowedToolsFilter(BaseModel):
+    """Filter configuration for restricting which MCP tools can be used.
+
+    :param tool_names: (Optional) List of specific tool names that are allowed
+    """
+
    tool_names: list[str] | None = None


@json_schema_type
 class OpenAIResponseInputToolMCP(BaseModel):
+    """Model Context Protocol (MCP) tool configuration for OpenAI response inputs.
+
+    :param type: Tool type identifier, always "mcp"
+    :param server_label: Label to identify this MCP server
+    :param server_url: URL endpoint of the MCP server
+    :param headers: (Optional) HTTP headers to include when connecting to the server
+    :param require_approval: Approval requirement for tool calls ("always", "never", or filter)
+    :param allowed_tools: (Optional) Restriction on which tools can be used from this server
+    """
+
    type: Literal["mcp"] = "mcp"
    server_label: str
    server_url: str
@ -500,17 +767,37 @@ register_schema(OpenAIResponseInputTool, name="OpenAIResponseInputTool")


 class ListOpenAIResponseInputItem(BaseModel):
+    """List container for OpenAI response input items.
+
+    :param data: List of input items
+    :param object: Object type identifier, always "list"
+    """
+
    data: list[OpenAIResponseInput]
    object: Literal["list"] = "list"


@json_schema_type
 class OpenAIResponseObjectWithInput(OpenAIResponseObject):
+    """OpenAI response object extended with input context information.
+
+    :param input: List of input items that led to this response
+    """
+
    input: list[OpenAIResponseInput]


@json_schema_type
 class ListOpenAIResponseObject(BaseModel):
+    """Paginated list of OpenAI response objects with navigation metadata.
+
+    :param data: List of response objects with their input context
+    :param has_more: Whether there are more results available beyond this page
+    :param first_id: Identifier of the first item in this page
+    :param last_id: Identifier of the last item in this page
+    :param object: Object type identifier, always "list"
+    """
+
    data: list[OpenAIResponseObjectWithInput]
    has_more: bool
    first_id: str
--- a/llama_stack/apis/benchmarks/benchmarks.py
+++ b/llama_stack/apis/benchmarks/benchmarks.py
@ -22,6 +22,14 @@ class CommonBenchmarkFields(BaseModel):

@json_schema_type
 class Benchmark(CommonBenchmarkFields, Resource):
+    """A benchmark resource for evaluating model performance.
+
+    :param dataset_id: Identifier of the dataset to use for the benchmark evaluation
+    :param scoring_functions: List of scoring function identifiers to apply during evaluation
+    :param metadata: Metadata for this evaluation task
+    :param type: The resource type, always benchmark
+    """
+
    type: Literal[ResourceType.benchmark] = ResourceType.benchmark

    @property
--- a/llama_stack/apis/common/content_types.py
+++ b/llama_stack/apis/common/content_types.py
@ -15,6 +15,11 @@ from llama_stack.schema_utils import json_schema_type, register_schema

@json_schema_type
 class URL(BaseModel):
+    """A URL reference to external content.
+
+    :param uri: The URL string pointing to the resource
+    """
+
    uri: str


@ -76,17 +81,36 @@ register_schema(InterleavedContent, name="InterleavedContent")

@json_schema_type
 class TextDelta(BaseModel):
+    """A text content delta for streaming responses.
+
+    :param type: Discriminator type of the delta. Always "text"
+    :param text: The incremental text content
+    """
+
    type: Literal["text"] = "text"
    text: str


@json_schema_type
 class ImageDelta(BaseModel):
+    """An image content delta for streaming responses.
+
+    :param type: Discriminator type of the delta. Always "image"
+    :param image: The incremental image data as bytes
+    """
+
    type: Literal["image"] = "image"
    image: bytes


 class ToolCallParseStatus(Enum):
+    """Status of tool call parsing during streaming.
+    :cvar started: Tool call parsing has begun
+    :cvar in_progress: Tool call parsing is ongoing
+    :cvar failed: Tool call parsing failed
+    :cvar succeeded: Tool call parsing completed successfully
+    """
+
    started = "started"
    in_progress = "in_progress"
    failed = "failed"
@ -95,6 +119,13 @@ class ToolCallParseStatus(Enum):

@json_schema_type
 class ToolCallDelta(BaseModel):
+    """A tool call content delta for streaming responses.
+
+    :param type: Discriminator type of the delta. Always "tool_call"
+    :param tool_call: Either an in-progress tool call string or the final parsed tool call
+    :param parse_status: Current parsing status of the tool call
+    """
+
    type: Literal["tool_call"] = "tool_call"

    # you either send an in-progress tool call so the client can stream a long
--- a/llama_stack/apis/common/errors.py
+++ b/llama_stack/apis/common/errors.py
@ -4,6 +4,21 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.

+# Custom Llama Stack Exception classes should follow the following schema
+#   1. All classes should inherit from an existing Built-In Exception class: https://docs.python.org/3/library/exceptions.html
+#   2. All classes should have a custom error message with the goal of informing the Llama Stack user specifically
+#   3. All classes should propagate the inherited __init__ function via 'super().__init__(message)'
+
+
+class ResourceNotFoundError(ValueError):
+    """generic exception for a missing Llama Stack resource"""
+
+    def __init__(self, resource_name: str, resource_type: str, client_list: str) -> None:
+        message = (
+            f"{resource_type} '{resource_name}' not found. Use '{client_list}' to list available {resource_type}s."
+        )
+        super().__init__(message)
+

 class UnsupportedModelError(ValueError):
    """raised when model is not present in the list of supported models"""
@ -11,3 +26,39 @@ class UnsupportedModelError(ValueError):
    def __init__(self, model_name: str, supported_models_list: list[str]):
        message = f"'{model_name}' model is not supported. Supported models are: {', '.join(supported_models_list)}"
        super().__init__(message)
+
+
+class ModelNotFoundError(ResourceNotFoundError):
+    """raised when Llama Stack cannot find a referenced model"""
+
+    def __init__(self, model_name: str) -> None:
+        super().__init__(model_name, "Model", "client.models.list()")
+
+
+class VectorStoreNotFoundError(ResourceNotFoundError):
+    """raised when Llama Stack cannot find a referenced vector store"""
+
+    def __init__(self, vector_store_name: str) -> None:
+        super().__init__(vector_store_name, "Vector Store", "client.vector_dbs.list()")
+
+
+class DatasetNotFoundError(ResourceNotFoundError):
+    """raised when Llama Stack cannot find a referenced dataset"""
+
+    def __init__(self, dataset_name: str) -> None:
+        super().__init__(dataset_name, "Dataset", "client.datasets.list()")
+
+
+class ToolGroupNotFoundError(ResourceNotFoundError):
+    """raised when Llama Stack cannot find a referenced tool group"""
+
+    def __init__(self, toolgroup_name: str) -> None:
+        super().__init__(toolgroup_name, "Tool Group", "client.toolgroups.list()")
+
+
+class SessionNotFoundError(ValueError):
+    """raised when Llama Stack cannot find a referenced session or access is denied"""
+
+    def __init__(self, session_name: str) -> None:
+        message = f"Session '{session_name}' not found or access denied."
+        super().__init__(message)
--- a/llama_stack/apis/common/job_types.py
+++ b/llama_stack/apis/common/job_types.py
@ -11,6 +11,14 @@ from llama_stack.schema_utils import json_schema_type


 class JobStatus(Enum):
+    """Status of a job execution.
+    :cvar completed: Job has finished successfully
+    :cvar in_progress: Job is currently running
+    :cvar failed: Job has failed during execution
+    :cvar scheduled: Job is scheduled but not yet started
+    :cvar cancelled: Job was cancelled before completion
+    """
+
    completed = "completed"
    in_progress = "in_progress"
    failed = "failed"
@ -20,5 +28,11 @@ class JobStatus(Enum):

@json_schema_type
 class Job(BaseModel):
+    """A job execution instance with status tracking.
+
+    :param job_id: Unique identifier for the job
+    :param status: Current execution status of the job
+    """
+
    job_id: str
    status: JobStatus
--- a/llama_stack/apis/common/responses.py
+++ b/llama_stack/apis/common/responses.py
@ -13,6 +13,11 @@ from llama_stack.schema_utils import json_schema_type


 class Order(Enum):
+    """Sort order for paginated responses.
+    :cvar asc: Ascending order
+    :cvar desc: Descending order
+    """
+
    asc = "asc"
    desc = "desc"

--- a/llama_stack/apis/common/training_types.py
+++ b/llama_stack/apis/common/training_types.py
@ -13,6 +13,14 @@ from llama_stack.schema_utils import json_schema_type

@json_schema_type
 class PostTrainingMetric(BaseModel):
+    """Training metrics captured during post-training jobs.
+
+    :param epoch: Training epoch number
+    :param train_loss: Loss value on the training dataset
+    :param validation_loss: Loss value on the validation dataset
+    :param perplexity: Perplexity metric indicating model confidence
+    """
+
    epoch: int
    train_loss: float
    validation_loss: float
@ -21,7 +29,15 @@ class PostTrainingMetric(BaseModel):

@json_schema_type
 class Checkpoint(BaseModel):
-    """Checkpoint created during training runs"""
+    """Checkpoint created during training runs.
+
+    :param identifier: Unique identifier for the checkpoint
+    :param created_at: Timestamp when the checkpoint was created
+    :param epoch: Training epoch when the checkpoint was saved
+    :param post_training_job_id: Identifier of the training job that created this checkpoint
+    :param path: File system path where the checkpoint is stored
+    :param training_metrics: (Optional) Training metrics associated with this checkpoint
+    """

    identifier: str
    created_at: datetime
--- a/llama_stack/apis/common/type_system.py
+++ b/llama_stack/apis/common/type_system.py
@ -13,59 +13,114 @@ from llama_stack.schema_utils import json_schema_type, register_schema

@json_schema_type
 class StringType(BaseModel):
+    """Parameter type for string values.
+
+    :param type: Discriminator type. Always "string"
+    """
+
    type: Literal["string"] = "string"


@json_schema_type
 class NumberType(BaseModel):
+    """Parameter type for numeric values.
+
+    :param type: Discriminator type. Always "number"
+    """
+
    type: Literal["number"] = "number"


@json_schema_type
 class BooleanType(BaseModel):
+    """Parameter type for boolean values.
+
+    :param type: Discriminator type. Always "boolean"
+    """
+
    type: Literal["boolean"] = "boolean"


@json_schema_type
 class ArrayType(BaseModel):
+    """Parameter type for array values.
+
+    :param type: Discriminator type. Always "array"
+    """
+
    type: Literal["array"] = "array"


@json_schema_type
 class ObjectType(BaseModel):
+    """Parameter type for object values.
+
+    :param type: Discriminator type. Always "object"
+    """
+
    type: Literal["object"] = "object"


@json_schema_type
 class JsonType(BaseModel):
+    """Parameter type for JSON values.
+
+    :param type: Discriminator type. Always "json"
+    """
+
    type: Literal["json"] = "json"


@json_schema_type
 class UnionType(BaseModel):
+    """Parameter type for union values.
+
+    :param type: Discriminator type. Always "union"
+    """
+
    type: Literal["union"] = "union"


@json_schema_type
 class ChatCompletionInputType(BaseModel):
+    """Parameter type for chat completion input.
+
+    :param type: Discriminator type. Always "chat_completion_input"
+    """
+
    # expects List[Message] for messages
    type: Literal["chat_completion_input"] = "chat_completion_input"


@json_schema_type
 class CompletionInputType(BaseModel):
+    """Parameter type for completion input.
+
+    :param type: Discriminator type. Always "completion_input"
+    """
+
    # expects InterleavedTextMedia for content
    type: Literal["completion_input"] = "completion_input"


@json_schema_type
 class AgentTurnInputType(BaseModel):
+    """Parameter type for agent turn input.
+
+    :param type: Discriminator type. Always "agent_turn_input"
+    """
+
    # expects List[Message] for messages (may also include attachments?)
    type: Literal["agent_turn_input"] = "agent_turn_input"


@json_schema_type
 class DialogType(BaseModel):
+    """Parameter type for dialog data with semantic output labels.
+
+    :param type: Discriminator type. Always "dialog"
+    """
+
    # expects List[Message] for messages
    # this type semantically contains the output label whereas ChatCompletionInputType does not
    type: Literal["dialog"] = "dialog"
--- a/llama_stack/apis/datasets/datasets.py
+++ b/llama_stack/apis/datasets/datasets.py
@ -94,6 +94,10 @@ register_schema(DataSource, name="DataSource")
 class CommonDatasetFields(BaseModel):
    """
    Common fields for a dataset.
+
+    :param purpose: Purpose of the dataset indicating its intended use
+    :param source: Data source configuration for the dataset
+    :param metadata: Additional metadata for the dataset
    """

    purpose: DatasetPurpose
@ -106,6 +110,11 @@ class CommonDatasetFields(BaseModel):

@json_schema_type
 class Dataset(CommonDatasetFields, Resource):
+    """Dataset resource for storing and accessing training or evaluation data.
+
+    :param type: Type of resource, always 'dataset' for datasets
+    """
+
    type: Literal[ResourceType.dataset] = ResourceType.dataset

    @property
@ -118,10 +127,20 @@ class Dataset(CommonDatasetFields, Resource):


 class DatasetInput(CommonDatasetFields, BaseModel):
+    """Input parameters for dataset operations.
+
+    :param dataset_id: Unique identifier for the dataset
+    """
+
    dataset_id: str


 class ListDatasetsResponse(BaseModel):
+    """Response from listing datasets.
+
+    :param data: List of datasets
+    """
+
    data: list[Dataset]


--- a/llama_stack/apis/datatypes.py
+++ b/llama_stack/apis/datatypes.py
@ -4,15 +4,106 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.

-from enum import Enum
+from enum import Enum, EnumMeta

-from pydantic import BaseModel
+from pydantic import BaseModel, Field

 from llama_stack.schema_utils import json_schema_type


+class DynamicApiMeta(EnumMeta):
+    def __new__(cls, name, bases, namespace):
+        # Store the original enum values
+        original_values = {k: v for k, v in namespace.items() if not k.startswith("_")}
+
+        # Create the enum class
+        cls = super().__new__(cls, name, bases, namespace)
+
+        # Store the original values for reference
+        cls._original_values = original_values
+        # Initialize _dynamic_values
+        cls._dynamic_values = {}
+
+        return cls
+
+    def __call__(cls, value):
+        try:
+            return super().__call__(value)
+        except ValueError as e:
+            # If this value was already dynamically added, return it
+            if value in cls._dynamic_values:
+                return cls._dynamic_values[value]
+
+            # The value is not a known or dynamically registered member.
+            # Derive the normalized member name from the value (lowercase, "-" -> "_")
+            member_name = value.lower().replace("-", "_")
+
+            # If this member name already exists in the enum, return the existing member
+            if member_name in cls._member_map_:
+                return cls._member_map_[member_name]
+
+            # Instead of creating a new member, raise ValueError to force users to use Api.add() to
+            # register new APIs explicitly
+            raise ValueError(f"API '{value}' does not exist. Use Api.add() to register new APIs.") from e
+
+    def __iter__(cls):
+        # Allow iteration over both static and dynamic members
+        yield from super().__iter__()
+        if hasattr(cls, "_dynamic_values"):
+            yield from cls._dynamic_values.values()
+
+    def add(cls, value):
+        """
+        Add a new API to the enum.
+        Used to register external APIs.
+        """
+        member_name = value.lower().replace("-", "_")
+
+        # If this member name already exists in the enum, return it
+        if member_name in cls._member_map_:
+            return cls._member_map_[member_name]
+
+        # Create a new enum member
+        member = object.__new__(cls)
+        member._name_ = member_name
+        member._value_ = value
+
+        # Add it to the enum class
+        cls._member_map_[member_name] = member
+        cls._member_names_.append(member_name)
+        cls._member_type_ = str
+
+        # Store it in our dynamic values
+        cls._dynamic_values[value] = member
+
+        return member
+
+
@json_schema_type
-class Api(Enum):
+class Api(Enum, metaclass=DynamicApiMeta):
+    """Enumeration of all available APIs in the Llama Stack system.
+    :cvar providers: Provider management and configuration
+    :cvar inference: Text generation, chat completions, and embeddings
+    :cvar safety: Content moderation and safety shields
+    :cvar agents: Agent orchestration and execution
+    :cvar vector_io: Vector database operations and queries
+    :cvar datasetio: Dataset input/output operations
+    :cvar scoring: Model output evaluation and scoring
+    :cvar eval: Model evaluation and benchmarking framework
+    :cvar post_training: Fine-tuning and model training
+    :cvar tool_runtime: Tool execution and management
+    :cvar telemetry: Observability and system monitoring
+    :cvar models: Model metadata and management
+    :cvar shields: Safety shield implementations
+    :cvar vector_dbs: Vector database management
+    :cvar datasets: Dataset creation and management
+    :cvar scoring_functions: Scoring function definitions
+    :cvar benchmarks: Benchmark suite management
+    :cvar tool_groups: Tool group organization
+    :cvar files: File storage and management
+    :cvar inspect: Built-in system inspection and introspection
+    """
+
    providers = "providers"
    inference = "inference"
    safety = "safety"
@ -54,3 +145,12 @@ class Error(BaseModel):
    title: str
    detail: str
    instance: str | None = None
+
+
+class ExternalApiSpec(BaseModel):
+    """Specification for an external API implementation."""
+
+    module: str = Field(..., description="Python module containing the API implementation")
+    name: str = Field(..., description="Name of the API")
+    pip_packages: list[str] = Field(default=[], description="List of pip packages to install the API")
+    protocol: str = Field(..., description="Name of the protocol class for the API")
--- a/llama_stack/apis/files/files.py
+++ b/llama_stack/apis/files/files.py
@ -54,6 +54,9 @@ class ListOpenAIFileResponse(BaseModel):
    Response for listing files in OpenAI Files API.

    :param data: List of file objects
+    :param has_more: Whether there are more files available beyond this page
+    :param first_id: ID of the first file in the list for pagination
+    :param last_id: ID of the last file in the list for pagination
    :param object: The object type, which is always "list"
    """

--- a/llama_stack/apis/inference/inference.py
+++ b/llama_stack/apis/inference/inference.py
@ -41,11 +41,23 @@ from enum import StrEnum

@json_schema_type
 class GreedySamplingStrategy(BaseModel):
+    """Greedy sampling strategy that selects the highest probability token at each step.
+
+    :param type: Must be "greedy" to identify this sampling strategy
+    """
+
    type: Literal["greedy"] = "greedy"


@json_schema_type
 class TopPSamplingStrategy(BaseModel):
+    """Top-p (nucleus) sampling strategy that samples from the smallest set of tokens with cumulative probability >= p.
+
+    :param type: Must be "top_p" to identify this sampling strategy
+    :param temperature: Controls randomness in sampling. Higher values increase randomness
+    :param top_p: Cumulative probability threshold for nucleus sampling. Defaults to 0.95
+    """
+
    type: Literal["top_p"] = "top_p"
    temperature: float | None = Field(..., gt=0.0)
    top_p: float | None = 0.95
@ -53,6 +65,12 @@ class TopPSamplingStrategy(BaseModel):

@json_schema_type
 class TopKSamplingStrategy(BaseModel):
+    """Top-k sampling strategy that restricts sampling to the k most likely tokens.
+
+    :param type: Must be "top_k" to identify this sampling strategy
+    :param top_k: Number of top tokens to consider for sampling. Must be at least 1
+    """
+
    type: Literal["top_k"] = "top_k"
    top_k: int = Field(..., ge=1)

@ -108,11 +126,21 @@ class QuantizationType(Enum):

@json_schema_type
 class Fp8QuantizationConfig(BaseModel):
+    """Configuration for 8-bit floating point quantization.
+
+    :param type: Must be "fp8_mixed" to identify this quantization type
+    """
+
    type: Literal["fp8_mixed"] = "fp8_mixed"


@json_schema_type
 class Bf16QuantizationConfig(BaseModel):
+    """Configuration for BFloat16 precision (typically no quantization).
+
+    :param type: Must be "bf16" to identify this quantization type
+    """
+
    type: Literal["bf16"] = "bf16"


@ -202,6 +230,14 @@ register_schema(Message, name="Message")

@json_schema_type
 class ToolResponse(BaseModel):
+    """Response from a tool invocation.
+
+    :param call_id: Unique identifier for the tool call this response is for
+    :param tool_name: Name of the tool that was invoked
+    :param content: The response content from the tool
+    :param metadata: (Optional) Additional metadata about the tool response
+    """
+
    call_id: str
    tool_name: BuiltinTool | str
    content: InterleavedContent
@ -439,24 +475,55 @@ class EmbeddingsResponse(BaseModel):

@json_schema_type
 class OpenAIChatCompletionContentPartTextParam(BaseModel):
+    """Text content part for OpenAI-compatible chat completion messages.
+
+    :param type: Must be "text" to identify this as text content
+    :param text: The text content of the message
+    """
+
    type: Literal["text"] = "text"
    text: str


@json_schema_type
 class OpenAIImageURL(BaseModel):
+    """Image URL specification for OpenAI-compatible chat completion messages.
+
+    :param url: URL of the image to include in the message
+    :param detail: (Optional) Level of detail for image processing. Can be "low", "high", or "auto"
+    """
+
    url: str
    detail: str | None = None


@json_schema_type
 class OpenAIChatCompletionContentPartImageParam(BaseModel):
+    """Image content part for OpenAI-compatible chat completion messages.
+
+    :param type: Must be "image_url" to identify this as image content
+    :param image_url: Image URL specification and processing details
+    """
+
    type: Literal["image_url"] = "image_url"
    image_url: OpenAIImageURL


+@json_schema_type
+class OpenAIFileFile(BaseModel):
+    file_data: str | None = None
+    file_id: str | None = None
+    filename: str | None = None
+
+
+@json_schema_type
+class OpenAIFile(BaseModel):
+    type: Literal["file"] = "file"
+    file: OpenAIFileFile
+
+
 OpenAIChatCompletionContentPartParam = Annotated[
-    OpenAIChatCompletionContentPartTextParam | OpenAIChatCompletionContentPartImageParam,
+    OpenAIChatCompletionContentPartTextParam | OpenAIChatCompletionContentPartImageParam | OpenAIFile,
    Field(discriminator="type"),
 ]
 register_schema(OpenAIChatCompletionContentPartParam, name="OpenAIChatCompletionContentPartParam")
@ -464,6 +531,8 @@ register_schema(OpenAIChatCompletionContentPartParam, name="OpenAIChatCompletion

 OpenAIChatCompletionMessageContent = str | list[OpenAIChatCompletionContentPartParam]

+OpenAIChatCompletionTextOnlyMessageContent = str | list[OpenAIChatCompletionContentPartTextParam]
+

@json_schema_type
 class OpenAIUserMessageParam(BaseModel):
@ -489,18 +558,32 @@ class OpenAISystemMessageParam(BaseModel):
    """

    role: Literal["system"] = "system"
-    content: OpenAIChatCompletionMessageContent
+    content: OpenAIChatCompletionTextOnlyMessageContent
    name: str | None = None


@json_schema_type
 class OpenAIChatCompletionToolCallFunction(BaseModel):
+    """Function call details for OpenAI-compatible tool calls.
+
+    :param name: (Optional) Name of the function to call
+    :param arguments: (Optional) Arguments to pass to the function as a JSON string
+    """
+
    name: str | None = None
    arguments: str | None = None


@json_schema_type
 class OpenAIChatCompletionToolCall(BaseModel):
+    """Tool call specification for OpenAI-compatible chat completion responses.
+
+    :param index: (Optional) Index of the tool call in the list
+    :param id: (Optional) Unique identifier for the tool call
+    :param type: Must be "function" to identify this as a function call
+    :param function: (Optional) Function call details
+    """
+
    index: int | None = None
    id: str | None = None
    type: Literal["function"] = "function"
@ -518,7 +601,7 @@ class OpenAIAssistantMessageParam(BaseModel):
    """

    role: Literal["assistant"] = "assistant"
-    content: OpenAIChatCompletionMessageContent | None = None
+    content: OpenAIChatCompletionTextOnlyMessageContent | None = None
    name: str | None = None
    tool_calls: list[OpenAIChatCompletionToolCall] | None = None

@ -534,7 +617,7 @@ class OpenAIToolMessageParam(BaseModel):

    role: Literal["tool"] = "tool"
    tool_call_id: str
-    content: OpenAIChatCompletionMessageContent
+    content: OpenAIChatCompletionTextOnlyMessageContent


@json_schema_type
@ -547,7 +630,7 @@ class OpenAIDeveloperMessageParam(BaseModel):
    """

    role: Literal["developer"] = "developer"
-    content: OpenAIChatCompletionMessageContent
+    content: OpenAIChatCompletionTextOnlyMessageContent
    name: str | None = None


@ -564,11 +647,24 @@ register_schema(OpenAIMessageParam, name="OpenAIMessageParam")

@json_schema_type
 class OpenAIResponseFormatText(BaseModel):
+    """Text response format for OpenAI-compatible chat completion requests.
+
+    :param type: Must be "text" to indicate plain text response format
+    """
+
    type: Literal["text"] = "text"


@json_schema_type
 class OpenAIJSONSchema(TypedDict, total=False):
+    """JSON schema specification for OpenAI-compatible structured response format.
+
+    :param name: Name of the schema
+    :param description: (Optional) Description of the schema
+    :param strict: (Optional) Whether to enforce strict adherence to the schema
+    :param schema: (Optional) The JSON schema definition
+    """
+
    name: str
    description: str | None
    strict: bool | None
@ -582,12 +678,23 @@ class OpenAIJSONSchema(TypedDict, total=False):

@json_schema_type
 class OpenAIResponseFormatJSONSchema(BaseModel):
+    """JSON schema response format for OpenAI-compatible chat completion requests.
+
+    :param type: Must be "json_schema" to indicate structured JSON response format
+    :param json_schema: The JSON schema specification for the response
+    """
+
    type: Literal["json_schema"] = "json_schema"
    json_schema: OpenAIJSONSchema


@json_schema_type
 class OpenAIResponseFormatJSONObject(BaseModel):
+    """JSON object response format for OpenAI-compatible chat completion requests.
+
+    :param type: Must be "json_object" to indicate generic JSON object response format
+    """
+
    type: Literal["json_object"] = "json_object"


@ -846,11 +953,21 @@ class EmbeddingTaskType(Enum):

@json_schema_type
 class BatchCompletionResponse(BaseModel):
+    """Response from a batch completion request.
+
+    :param batch: List of completion responses, one for each input in the batch
+    """
+
    batch: list[CompletionResponse]


@json_schema_type
 class BatchChatCompletionResponse(BaseModel):
+    """Response from a batch chat completion request.
+
+    :param batch: List of chat completion responses, one for each conversation in the batch
+    """
+
    batch: list[ChatCompletionResponse]


@ -860,6 +977,15 @@ class OpenAICompletionWithInputMessages(OpenAIChatCompletion):

@json_schema_type
 class ListOpenAIChatCompletionResponse(BaseModel):
+    """Response from listing OpenAI-compatible chat completions.
+
+    :param data: List of chat completion objects with their input messages
+    :param has_more: Whether there are more completions available beyond this list
+    :param first_id: ID of the first completion in this list
+    :param last_id: ID of the last completion in this list
+    :param object: Must be "list" to identify this as a list response
+    """
+
    data: list[OpenAICompletionWithInputMessages]
    has_more: bool
    first_id: str
--- a/llama_stack/apis/inspect/inspect.py
+++ b/llama_stack/apis/inspect/inspect.py
@ -14,6 +14,13 @@ from llama_stack.schema_utils import json_schema_type, webmethod

@json_schema_type
 class RouteInfo(BaseModel):
+    """Information about an API route including its path, method, and implementing providers.
+
+    :param route: The API endpoint path
+    :param method: HTTP method for the route
+    :param provider_types: List of provider types that implement this route
+    """
+
    route: str
    method: str
    provider_types: list[str]
@ -21,15 +28,30 @@ class RouteInfo(BaseModel):

@json_schema_type
 class HealthInfo(BaseModel):
+    """Health status information for the service.
+
+    :param status: Current health status of the service
+    """
+
    status: HealthStatus


@json_schema_type
 class VersionInfo(BaseModel):
+    """Version information for the service.
+
+    :param version: Version number of the service
+    """
+
    version: str


 class ListRoutesResponse(BaseModel):
+    """Response containing a list of all available API routes.
+
+    :param data: List of available route information objects
+    """
+
    data: list[RouteInfo]


@ -37,17 +59,17 @@ class ListRoutesResponse(BaseModel):
 class Inspect(Protocol):
    @webmethod(route="/inspect/routes", method="GET")
    async def list_routes(self) -> ListRoutesResponse:
-        """List all routes.
+        """List all available API routes with their methods and implementing providers.

-        :returns: A ListRoutesResponse.
+        :returns: Response containing information about all available routes.
        """
        ...

    @webmethod(route="/health", method="GET")
    async def health(self) -> HealthInfo:
-        """Get the health of the service.
+        """Get the current health status of the service.

-        :returns: A HealthInfo.
+        :returns: Health information indicating if the service is operational.
        """
        ...

@ -55,6 +77,6 @@ class Inspect(Protocol):
    async def version(self) -> VersionInfo:
        """Get the version of the service.

-        :returns: A VersionInfo.
+        :returns: Version information containing the service version number.
        """
        ...
--- a/llama_stack/apis/models/models.py
+++ b/llama_stack/apis/models/models.py
@ -7,7 +7,7 @@
 from enum import StrEnum
 from typing import Any, Literal, Protocol, runtime_checkable

-from pydantic import BaseModel, ConfigDict, Field
+from pydantic import BaseModel, ConfigDict, Field, field_validator

 from llama_stack.apis.resource import Resource, ResourceType
 from llama_stack.providers.utils.telemetry.trace_protocol import trace_protocol
@ -23,12 +23,27 @@ class CommonModelFields(BaseModel):

@json_schema_type
 class ModelType(StrEnum):
+    """Enumeration of supported model types in Llama Stack.
+    :cvar llm: Large language model for text generation and completion
+    :cvar embedding: Embedding model for converting text to vector representations
+    """
+
    llm = "llm"
    embedding = "embedding"


@json_schema_type
 class Model(CommonModelFields, Resource):
+    """A model resource representing an AI model registered in Llama Stack.
+
+    :param type: The resource type, always 'model' for model resources
+    :param model_type: The type of model (LLM or embedding model)
+    :param metadata: Any additional metadata for this model
+    :param identifier: Unique identifier for this resource in llama stack
+    :param provider_resource_id: Unique identifier for this resource in the provider
+    :param provider_id: ID of the provider that owns this resource
+    """
+
    type: Literal[ResourceType.model] = ResourceType.model

    @property
@ -36,13 +51,21 @@ class Model(CommonModelFields, Resource):
        return self.identifier

    @property
-    def provider_model_id(self) -> str | None:
+    def provider_model_id(self) -> str:
+        assert self.provider_resource_id is not None, "Provider resource ID must be set"
        return self.provider_resource_id

    model_config = ConfigDict(protected_namespaces=())

    model_type: ModelType = Field(default=ModelType.llm)

+    @field_validator("provider_resource_id")
+    @classmethod
+    def validate_provider_resource_id(cls, v):
+        if v is None:
+            raise ValueError("provider_resource_id cannot be None")
+        return v
+

 class ModelInput(CommonModelFields):
    model_id: str
--- a/llama_stack/apis/post_training/post_training.py
+++ b/llama_stack/apis/post_training/post_training.py
@ -18,6 +18,12 @@ from llama_stack.schema_utils import json_schema_type, register_schema, webmetho

@json_schema_type
 class OptimizerType(Enum):
+    """Available optimizer algorithms for training.
+    :cvar adam: Adaptive Moment Estimation optimizer
+    :cvar adamw: AdamW optimizer with weight decay
+    :cvar sgd: Stochastic Gradient Descent optimizer
+    """
+
    adam = "adam"
    adamw = "adamw"
    sgd = "sgd"
@ -25,12 +31,28 @@ class OptimizerType(Enum):

@json_schema_type
 class DatasetFormat(Enum):
+    """Format of the training dataset.
+    :cvar instruct: Instruction-following format with prompt and completion
+    :cvar dialog: Multi-turn conversation format with messages
+    """
+
    instruct = "instruct"
    dialog = "dialog"


@json_schema_type
 class DataConfig(BaseModel):
+    """Configuration for training data and data loading.
+
+    :param dataset_id: Unique identifier for the training dataset
+    :param batch_size: Number of samples per training batch
+    :param shuffle: Whether to shuffle the dataset during training
+    :param data_format: Format of the dataset (instruct or dialog)
+    :param validation_dataset_id: (Optional) Unique identifier for the validation dataset
+    :param packed: (Optional) Whether to pack multiple samples into a single sequence for efficiency
+    :param train_on_input: (Optional) Whether to compute loss on input tokens as well as output tokens
+    """
+
    dataset_id: str
    batch_size: int
    shuffle: bool
@ -42,6 +64,14 @@ class DataConfig(BaseModel):

@json_schema_type
 class OptimizerConfig(BaseModel):
+    """Configuration parameters for the optimization algorithm.
+
+    :param optimizer_type: Type of optimizer to use (adam, adamw, or sgd)
+    :param lr: Learning rate for the optimizer
+    :param weight_decay: Weight decay coefficient for regularization
+    :param num_warmup_steps: Number of steps for learning rate warmup
+    """
+
    optimizer_type: OptimizerType
    lr: float
    weight_decay: float
@ -50,6 +80,14 @@ class OptimizerConfig(BaseModel):

@json_schema_type
 class EfficiencyConfig(BaseModel):
+    """Configuration for memory and compute efficiency optimizations.
+
+    :param enable_activation_checkpointing: (Optional) Whether to use activation checkpointing to reduce memory usage
+    :param enable_activation_offloading: (Optional) Whether to offload activations to CPU to save GPU memory
+    :param memory_efficient_fsdp_wrap: (Optional) Whether to use memory-efficient FSDP wrapping
+    :param fsdp_cpu_offload: (Optional) Whether to offload FSDP parameters to CPU
+    """
+
    enable_activation_checkpointing: bool | None = False
    enable_activation_offloading: bool | None = False
    memory_efficient_fsdp_wrap: bool | None = False
@ -58,6 +96,18 @@ class EfficiencyConfig(BaseModel):

@json_schema_type
 class TrainingConfig(BaseModel):
+    """Comprehensive configuration for the training process.
+
+    :param n_epochs: Number of training epochs to run
+    :param max_steps_per_epoch: Maximum number of steps to run per epoch
+    :param gradient_accumulation_steps: Number of steps to accumulate gradients before updating
+    :param max_validation_steps: (Optional) Maximum number of validation steps per epoch
+    :param data_config: (Optional) Configuration for data loading and formatting
+    :param optimizer_config: (Optional) Configuration for the optimization algorithm
+    :param efficiency_config: (Optional) Configuration for memory and compute optimizations
+    :param dtype: (Optional) Data type for model parameters (bf16, fp16, fp32)
+    """
+
    n_epochs: int
    max_steps_per_epoch: int = 1
    gradient_accumulation_steps: int = 1
@ -70,6 +120,18 @@ class TrainingConfig(BaseModel):

@json_schema_type
 class LoraFinetuningConfig(BaseModel):
+    """Configuration for Low-Rank Adaptation (LoRA) fine-tuning.
+
+    :param type: Algorithm type identifier, always "LoRA"
+    :param lora_attn_modules: List of attention module names to apply LoRA to
+    :param apply_lora_to_mlp: Whether to apply LoRA to MLP layers
+    :param apply_lora_to_output: Whether to apply LoRA to output projection layers
+    :param rank: Rank of the LoRA adaptation (lower rank = fewer parameters)
+    :param alpha: LoRA scaling parameter that controls adaptation strength
+    :param use_dora: (Optional) Whether to use DoRA (Weight-Decomposed Low-Rank Adaptation)
+    :param quantize_base: (Optional) Whether to quantize the base model weights
+    """
+
    type: Literal["LoRA"] = "LoRA"
    lora_attn_modules: list[str]
    apply_lora_to_mlp: bool
@ -82,6 +144,13 @@ class LoraFinetuningConfig(BaseModel):

@json_schema_type
 class QATFinetuningConfig(BaseModel):
+    """Configuration for Quantization-Aware Training (QAT) fine-tuning.
+
+    :param type: Algorithm type identifier, always "QAT"
+    :param quantizer_name: Name of the quantization algorithm to use
+    :param group_size: Size of groups for grouped quantization
+    """
+
    type: Literal["QAT"] = "QAT"
    quantizer_name: str
    group_size: int
@ -93,7 +162,11 @@ register_schema(AlgorithmConfig, name="AlgorithmConfig")

@json_schema_type
 class PostTrainingJobLogStream(BaseModel):
-    """Stream of logs from a finetuning job."""
+    """Stream of logs from a finetuning job.
+
+    :param job_uuid: Unique identifier for the training job
+    :param log_lines: List of log message strings from the training process
+    """

    job_uuid: str
    log_lines: list[str]
@ -101,20 +174,48 @@ class PostTrainingJobLogStream(BaseModel):

@json_schema_type
 class RLHFAlgorithm(Enum):
+    """Available reinforcement learning from human feedback algorithms.
+    :cvar dpo: Direct Preference Optimization algorithm
+    """
+
    dpo = "dpo"


+@json_schema_type
+class DPOLossType(Enum):  # Loss function choices for DPOAlignmentConfig.loss_type; sigmoid is its default
+    sigmoid = "sigmoid"
+    hinge = "hinge"
+    ipo = "ipo"
+    kto_pair = "kto_pair"
+
+
@json_schema_type
 class DPOAlignmentConfig(BaseModel):
-    reward_scale: float
-    reward_clip: float
-    epsilon: float
-    gamma: float
+    """Configuration for Direct Preference Optimization (DPO) alignment.
+
+    :param beta: Temperature parameter for the DPO loss
+    :param loss_type: The type of loss function to use for DPO
+    """
+
+    beta: float
+    loss_type: DPOLossType = DPOLossType.sigmoid


@json_schema_type
 class PostTrainingRLHFRequest(BaseModel):
-    """Request to finetune a model."""
+    """Request to finetune a model using reinforcement learning from human feedback.
+
+    :param job_uuid: Unique identifier for the training job
+    :param finetuned_model: URL or path to the base model to fine-tune
+    :param dataset_id: Unique identifier for the training dataset
+    :param validation_dataset_id: Unique identifier for the validation dataset
+    :param algorithm: RLHF algorithm to use for training
+    :param algorithm_config: Configuration parameters for the RLHF algorithm
+    :param optimizer_config: Configuration parameters for the optimization algorithm
+    :param training_config: Configuration parameters for the training process
+    :param hyperparam_search_config: Configuration for hyperparameter search
+    :param logger_config: Configuration for training logging
+    """

    job_uuid: str

@ -140,7 +241,16 @@ class PostTrainingJob(BaseModel):

@json_schema_type
 class PostTrainingJobStatusResponse(BaseModel):
-    """Status of a finetuning job."""
+    """Status of a finetuning job.
+
+    :param job_uuid: Unique identifier for the training job
+    :param status: Current status of the training job
+    :param scheduled_at: (Optional) Timestamp when the job was scheduled
+    :param started_at: (Optional) Timestamp when the job execution began
+    :param completed_at: (Optional) Timestamp when the job finished, if completed
+    :param resources_allocated: (Optional) Information about computational resources allocated to the job
+    :param checkpoints: List of model checkpoints created during training
+    """

    job_uuid: str
    status: JobStatus
@ -160,7 +270,11 @@ class ListPostTrainingJobsResponse(BaseModel):

@json_schema_type
 class PostTrainingJobArtifactsResponse(BaseModel):
-    """Artifacts of a finetuning job."""
+    """Artifacts of a finetuning job.
+
+    :param job_uuid: Unique identifier for the training job
+    :param checkpoints: List of model checkpoints created during training
+    """

    job_uuid: str
    checkpoints: list[Checkpoint] = Field(default_factory=list)
--- a/llama_stack/apis/providers/providers.py
+++ b/llama_stack/apis/providers/providers.py
@ -14,6 +14,15 @@ from llama_stack.schema_utils import json_schema_type, webmethod

@json_schema_type
 class ProviderInfo(BaseModel):
+    """Information about a registered provider including its configuration and health status.
+
+    :param api: The API name this provider implements
+    :param provider_id: Unique identifier for the provider
+    :param provider_type: The type of provider implementation
+    :param config: Configuration parameters for the provider
+    :param health: Current health status of the provider
+    """
+
    api: str
    provider_id: str
    provider_type: str
@ -22,6 +31,11 @@ class ProviderInfo(BaseModel):


 class ListProvidersResponse(BaseModel):
+    """Response containing a list of all available providers.
+
+    :param data: List of provider information objects
+    """
+
    data: list[ProviderInfo]


--- a/llama_stack/apis/safety/safety.py
+++ b/llama_stack/apis/safety/safety.py
@ -4,7 +4,7 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.

-from enum import Enum
+from enum import Enum, StrEnum
 from typing import Any, Protocol, runtime_checkable

 from pydantic import BaseModel, Field
@ -15,8 +15,80 @@ from llama_stack.providers.utils.telemetry.trace_protocol import trace_protocol
 from llama_stack.schema_utils import json_schema_type, webmethod


+# OpenAI Categories to return in the response
+class OpenAICategories(StrEnum):
+    """
+    Required set of categories in the moderations API response.
+    """
+
+    VIOLENCE = "violence"
+    VIOLENCE_GRAPHIC = "violence/graphic"
+    HARRASMENT = "harassment"  # NOTE(review): member name is misspelled; should be HARASSMENT
+    HARRASMENT_THREATENING = "harassment/threatening"  # NOTE(review): same misspelling as HARRASMENT
+    HATE = "hate"
+    HATE_THREATENING = "hate/threatening"
+    ILLICIT = "illicit"
+    ILLICIT_VIOLENT = "illicit/violent"
+    SEXUAL = "sexual"
+    SEXUAL_MINORS = "sexual/minors"
+    SELF_HARM = "self-harm"
+    SELF_HARM_INTENT = "self-harm/intent"
+    SELF_HARM_INSTRUCTIONS = "self-harm/instructions"
+
+
+@json_schema_type
+class ModerationObjectResults(BaseModel):
+    """A single result entry within a moderation object.
+    :param flagged: Whether any of the below categories are flagged.
+    :param categories: A mapping of category names to whether they are flagged or not.
+    :param category_applied_input_types: A mapping of category names to the input type(s) that the score applies to.
+    :param category_scores: A mapping of category names to their scores as predicted by model.
+    Required set of categories that need to be in response
+    - violence
+    - violence/graphic
+    - harassment
+    - harassment/threatening
+    - hate
+    - hate/threatening
+    - illicit
+    - illicit/violent
+    - sexual
+    - sexual/minors
+    - self-harm
+    - self-harm/intent
+    - self-harm/instructions
+    """
+
+    flagged: bool
+    categories: dict[str, bool] | None = None
+    category_applied_input_types: dict[str, list[str]] | None = None
+    category_scores: dict[str, float] | None = None
+    user_message: str | None = None  # NOTE(review): no :param entry; presumably a user-facing message - confirm
+    metadata: dict[str, Any] = Field(default_factory=dict)  # NOTE(review): no :param entry; contents unspecified here
+
+
+@json_schema_type
+class ModerationObject(BaseModel):
+    """A moderation object.
+    :param id: The unique identifier for the moderation request.
+    :param model: The model used to generate the moderation results.
+    :param results: A list of moderation results (ModerationObjectResults entries).
+    """
+
+    id: str
+    model: str
+    results: list[ModerationObjectResults]
+
+
@json_schema_type
 class ViolationLevel(Enum):
+    """Severity level of a safety violation.
+
+    :cvar INFO: Informational level violation that does not require action
+    :cvar WARN: Warning level violation that suggests caution but allows continuation
+    :cvar ERROR: Error level violation that requires blocking or intervention
+    """
+
    INFO = "info"
    WARN = "warn"
    ERROR = "error"
@ -24,6 +96,13 @@ class ViolationLevel(Enum):

@json_schema_type
 class SafetyViolation(BaseModel):
+    """Details of a safety violation detected by content moderation.
+
+    :param violation_level: Severity level of the violation
+    :param user_message: (Optional) Message to convey to the user about the violation
+    :param metadata: Additional metadata including specific violation codes for debugging and telemetry
+    """
+
    violation_level: ViolationLevel

    # what message should you convey to the user
@ -36,6 +115,11 @@ class SafetyViolation(BaseModel):

@json_schema_type
 class RunShieldResponse(BaseModel):
+    """Response from running a safety shield.
+
+    :param violation: (Optional) Safety violation detected by the shield, if any
+    """
+
    violation: SafetyViolation | None = None


@ -63,3 +147,13 @@ class Safety(Protocol):
        :returns: A RunShieldResponse.
        """
        ...
+
+    @webmethod(route="/openai/v1/moderations", method="POST")
+    async def run_moderation(self, input: str | list[str], model: str) -> ModerationObject:
+        """Classifies if text inputs are potentially harmful.
+        :param input: Input (or inputs) to classify.
+        Can be a single string or an array of strings.
+        :param model: The content moderation model you would like to use.
+        :returns: A moderation object.
+        """
+        ...
--- a/llama_stack/apis/scoring/scoring.py
+++ b/llama_stack/apis/scoring/scoring.py
@ -31,6 +31,12 @@ class ScoringResult(BaseModel):

@json_schema_type
 class ScoreBatchResponse(BaseModel):
+    """Response from batch scoring operations on datasets.
+
+    :param dataset_id: (Optional) The identifier of the dataset that was scored
+    :param results: A map of scoring function name to ScoringResult
+    """
+
    dataset_id: str | None = None
    results: dict[str, ScoringResult]

--- a/llama_stack/apis/scoring_functions/scoring_functions.py
+++ b/llama_stack/apis/scoring_functions/scoring_functions.py
@ -25,6 +25,12 @@ from llama_stack.schema_utils import json_schema_type, register_schema, webmetho
 # with standard metrics so they can be rolled up?
@json_schema_type
 class ScoringFnParamsType(StrEnum):
+    """Types of scoring function parameter configurations.
+    :cvar llm_as_judge: Use an LLM model to evaluate and score responses
+    :cvar regex_parser: Use regex patterns to extract and score specific parts of responses
+    :cvar basic: Basic scoring with simple aggregation functions
+    """
+
    llm_as_judge = "llm_as_judge"
    regex_parser = "regex_parser"
    basic = "basic"
@ -32,6 +38,14 @@ class ScoringFnParamsType(StrEnum):

@json_schema_type
 class AggregationFunctionType(StrEnum):
+    """Types of aggregation functions for scoring results.
+    :cvar average: Calculate the arithmetic mean of scores
+    :cvar weighted_average: Calculate a weighted average of scores
+    :cvar median: Calculate the median value of scores
+    :cvar categorical_count: Count occurrences of categorical values
+    :cvar accuracy: Calculate accuracy as the proportion of correct answers
+    """
+
    average = "average"
    weighted_average = "weighted_average"
    median = "median"
@ -41,6 +55,14 @@ class AggregationFunctionType(StrEnum):

@json_schema_type
 class LLMAsJudgeScoringFnParams(BaseModel):
+    """Parameters for LLM-as-judge scoring function configuration.
+    :param type: The type of scoring function parameters, always llm_as_judge
+    :param judge_model: Identifier of the LLM model to use as a judge for scoring
+    :param prompt_template: (Optional) Custom prompt template for the judge model
+    :param judge_score_regexes: Regexes to extract the answer from generated response
+    :param aggregation_functions: Aggregation functions to apply to the scores of each row
+    """
+
    type: Literal[ScoringFnParamsType.llm_as_judge] = ScoringFnParamsType.llm_as_judge
    judge_model: str
    prompt_template: str | None = None
@ -56,6 +78,12 @@ class LLMAsJudgeScoringFnParams(BaseModel):

@json_schema_type
 class RegexParserScoringFnParams(BaseModel):
+    """Parameters for regex parser scoring function configuration.
+    :param type: The type of scoring function parameters, always regex_parser
+    :param parsing_regexes: Regex to extract the answer from generated response
+    :param aggregation_functions: Aggregation functions to apply to the scores of each row
+    """
+
    type: Literal[ScoringFnParamsType.regex_parser] = ScoringFnParamsType.regex_parser
    parsing_regexes: list[str] = Field(
        description="Regex to extract the answer from generated response",
@ -69,6 +97,11 @@ class RegexParserScoringFnParams(BaseModel):

@json_schema_type
 class BasicScoringFnParams(BaseModel):
+    """Parameters for basic scoring function configuration.
+    :param type: The type of scoring function parameters, always basic
+    :param aggregation_functions: Aggregation functions to apply to the scores of each row
+    """
+
    type: Literal[ScoringFnParamsType.basic] = ScoringFnParamsType.basic
    aggregation_functions: list[AggregationFunctionType] = Field(
        description="Aggregation functions to apply to the scores of each row",
@ -100,6 +133,10 @@ class CommonScoringFnFields(BaseModel):

@json_schema_type
 class ScoringFn(CommonScoringFnFields, Resource):
+    """A scoring function resource for evaluating model outputs.
+    :param type: The resource type, always scoring_function
+    """
+
    type: Literal[ResourceType.scoring_function] = ResourceType.scoring_function

    @property
--- a/llama_stack/apis/shields/shields.py
+++ b/llama_stack/apis/shields/shields.py
@ -19,7 +19,11 @@ class CommonShieldFields(BaseModel):

@json_schema_type
 class Shield(CommonShieldFields, Resource):
-    """A safety shield resource that can be used to check content"""
+    """A safety shield resource that can be used to check content.
+
+    :param params: (Optional) Configuration parameters for the shield
+    :param type: The resource type, always shield
+    """

    type: Literal[ResourceType.shield] = ResourceType.shield

@ -79,3 +83,11 @@ class Shields(Protocol):
        :returns: A Shield.
        """
        ...
+
+    @webmethod(route="/shields/{identifier:path}", method="DELETE")
+    async def unregister_shield(self, identifier: str) -> None:
+        """Unregister a shield.
+
+        :param identifier: The identifier of the shield to unregister.
+        """
+        ...
--- a/llama_stack/apis/synthetic_data_generation/synthetic_data_generation.py
+++ b/llama_stack/apis/synthetic_data_generation/synthetic_data_generation.py
@ -14,7 +14,15 @@ from llama_stack.schema_utils import json_schema_type, webmethod


 class FilteringFunction(Enum):
-    """The type of filtering function."""
+    """The type of filtering function.
+
+    :cvar none: No filtering applied, accept all generated synthetic data
+    :cvar random: Random sampling of generated data points
+    :cvar top_k: Keep only the top-k highest scoring synthetic data samples
+    :cvar top_p: Nucleus-style filtering, keep samples exceeding cumulative score threshold
+    :cvar top_k_top_p: Combined top-k and top-p filtering strategy
+    :cvar sigmoid: Apply sigmoid function for probability-based filtering
+    """

    none = "none"
    random = "random"
@ -26,7 +34,12 @@ class FilteringFunction(Enum):

@json_schema_type
 class SyntheticDataGenerationRequest(BaseModel):
-    """Request to generate synthetic data. A small batch of prompts and a filtering function"""
+    """Request to generate synthetic data. A small batch of prompts and a filtering function.
+
+    :param dialogs: List of conversation messages to use as input for synthetic data generation
+    :param filtering_function: Type of filtering to apply to generated synthetic data samples
+    :param model: (Optional) The identifier of the model to use. The model must be registered with Llama Stack and available via the /models endpoint
+    """

    dialogs: list[Message]
    filtering_function: FilteringFunction = FilteringFunction.none
@ -35,7 +48,11 @@ class SyntheticDataGenerationRequest(BaseModel):

@json_schema_type
 class SyntheticDataGenerationResponse(BaseModel):
-    """Response from the synthetic data generation. Batch of (prompt, response, score) tuples that pass the threshold."""
+    """Response from the synthetic data generation. Batch of (prompt, response, score) tuples that pass the threshold.
+
+    :param synthetic_data: List of generated synthetic data samples that passed the filtering criteria
+    :param statistics: (Optional) Statistical information about the generation process and filtering results
+    """

    synthetic_data: list[dict[str, Any]]
    statistics: dict[str, Any] | None = None
@ -48,4 +65,12 @@ class SyntheticDataGeneration(Protocol):
        dialogs: list[Message],
        filtering_function: FilteringFunction = FilteringFunction.none,
        model: str | None = None,
-    ) -> SyntheticDataGenerationResponse: ...
+    ) -> SyntheticDataGenerationResponse:
+        """Generate synthetic data based on input dialogs and apply filtering.
+
+        :param dialogs: List of conversation messages to use as input for synthetic data generation
+        :param filtering_function: Type of filtering to apply to generated synthetic data samples
+        :param model: (Optional) The identifier of the model to use. The model must be registered with Llama Stack and available via the /models endpoint
+        :returns: Response containing filtered synthetic data samples and optional statistics
+        """
+        ...
--- a/llama_stack/apis/telemetry/telemetry.py
+++ b/llama_stack/apis/telemetry/telemetry.py
@ -22,15 +22,32 @@ from llama_stack.schema_utils import json_schema_type, register_schema, webmetho
 # Add this constant near the top of the file, after the imports
 DEFAULT_TTL_DAYS = 7

+REQUIRED_SCOPE = "telemetry.read"
+

@json_schema_type
 class SpanStatus(Enum):
+    """The status of a span indicating whether it completed successfully or with an error.
+    :cvar OK: Span completed successfully without errors
+    :cvar ERROR: Span completed with an error or failure
+    """
+
    OK = "ok"
    ERROR = "error"


@json_schema_type
 class Span(BaseModel):
+    """A span representing a single operation within a trace.
+    :param span_id: Unique identifier for the span
+    :param trace_id: Unique identifier for the trace this span belongs to
+    :param parent_span_id: (Optional) Unique identifier for the parent span, if this is a child span
+    :param name: Human-readable name describing the operation this span represents
+    :param start_time: Timestamp when the operation began
+    :param end_time: (Optional) Timestamp when the operation finished, if completed
+    :param attributes: (Optional) Key-value pairs containing additional metadata about the span
+    """
+
    span_id: str
    trace_id: str
    parent_span_id: str | None = None
@ -47,6 +64,13 @@ class Span(BaseModel):

@json_schema_type
 class Trace(BaseModel):
+    """A trace representing the complete execution path of a request across multiple operations.
+    :param trace_id: Unique identifier for the trace
+    :param root_span_id: Unique identifier for the root span that started this trace
+    :param start_time: Timestamp when the trace began
+    :param end_time: (Optional) Timestamp when the trace finished, if completed
+    """
+
    trace_id: str
    root_span_id: str
    start_time: datetime
@ -55,6 +79,12 @@ class Trace(BaseModel):

@json_schema_type
 class EventType(Enum):
+    """The type of telemetry event being logged.
+    :cvar UNSTRUCTURED_LOG: A simple log message with severity level
+    :cvar STRUCTURED_LOG: A structured log event with typed payload data
+    :cvar METRIC: A metric measurement with value and unit
+    """
+
    UNSTRUCTURED_LOG = "unstructured_log"
    STRUCTURED_LOG = "structured_log"
    METRIC = "metric"
@ -62,6 +92,15 @@ class EventType(Enum):

@json_schema_type
 class LogSeverity(Enum):
+    """The severity level of a log message.
+    :cvar VERBOSE: Detailed diagnostic information for troubleshooting
+    :cvar DEBUG: Debug information useful during development
+    :cvar INFO: General informational messages about normal operation
+    :cvar WARN: Warning messages about potentially problematic situations
+    :cvar ERROR: Error messages indicating failures that don't stop execution
+    :cvar CRITICAL: Critical error messages indicating severe failures
+    """
+
    VERBOSE = "verbose"
    DEBUG = "debug"
    INFO = "info"
@ -71,6 +110,13 @@ class LogSeverity(Enum):


 class EventCommon(BaseModel):
+    """Common fields shared by all telemetry events.
+    :param trace_id: Unique identifier for the trace this event belongs to
+    :param span_id: Unique identifier for the span this event belongs to
+    :param timestamp: Timestamp when the event occurred
+    :param attributes: (Optional) Key-value pairs containing additional metadata about the event
+    """
+
    trace_id: str
    span_id: str
    timestamp: datetime
@ -79,6 +125,12 @@ class EventCommon(BaseModel):

@json_schema_type
 class UnstructuredLogEvent(EventCommon):
+    """An unstructured log event containing a simple text message.
+    :param type: Event type identifier set to UNSTRUCTURED_LOG
+    :param message: The log message text
+    :param severity: The severity level of the log message
+    """
+
    type: Literal[EventType.UNSTRUCTURED_LOG] = EventType.UNSTRUCTURED_LOG
    message: str
    severity: LogSeverity
@ -86,6 +138,13 @@ class UnstructuredLogEvent(EventCommon):

@json_schema_type
 class MetricEvent(EventCommon):
+    """A metric event containing a measured value.
+    :param type: Event type identifier set to METRIC
+    :param metric: The name of the metric being measured
+    :param value: The numeric value of the metric measurement
+    :param unit: The unit of measurement for the metric value
+    """
+
    type: Literal[EventType.METRIC] = EventType.METRIC
    metric: str  # this would be an enum
    value: int | float
@ -94,6 +153,12 @@ class MetricEvent(EventCommon):

@json_schema_type
 class MetricInResponse(BaseModel):
+    """A metric value included in API responses.
+    :param metric: The name of the metric
+    :param value: The numeric value of the metric
+    :param unit: (Optional) The unit of measurement for the metric value
+    """
+
    metric: str
    value: int | float
    unit: str | None = None
@ -120,17 +185,32 @@ class MetricInResponse(BaseModel):


 class MetricResponseMixin(BaseModel):
+    """Mixin class for API responses that can include metrics.
+    :param metrics: (Optional) List of metrics associated with the API response
+    """
+
    metrics: list[MetricInResponse] | None = None


@json_schema_type
 class StructuredLogType(Enum):
+    """The type of structured log event payload.
+    :cvar SPAN_START: Event indicating the start of a new span
+    :cvar SPAN_END: Event indicating the completion of a span
+    """
+
    SPAN_START = "span_start"
    SPAN_END = "span_end"


@json_schema_type
 class SpanStartPayload(BaseModel):
+    """Payload for a span start event.
+    :param type: Payload type identifier set to SPAN_START
+    :param name: Human-readable name describing the operation this span represents
+    :param parent_span_id: (Optional) Unique identifier for the parent span, if this is a child span
+    """
+
    type: Literal[StructuredLogType.SPAN_START] = StructuredLogType.SPAN_START
    name: str
    parent_span_id: str | None = None
@ -138,6 +218,11 @@ class SpanStartPayload(BaseModel):

@json_schema_type
 class SpanEndPayload(BaseModel):
+    """Payload for a span end event.
+    :param type: Payload type identifier set to SPAN_END
+    :param status: The final status of the span indicating success or failure
+    """
+
    type: Literal[StructuredLogType.SPAN_END] = StructuredLogType.SPAN_END
    status: SpanStatus

@ -151,6 +236,11 @@ register_schema(StructuredLogPayload, name="StructuredLogPayload")

@json_schema_type
 class StructuredLogEvent(EventCommon):
+    """A structured log event containing typed payload data.
+    :param type: Event type identifier set to STRUCTURED_LOG
+    :param payload: The structured payload data for the log event
+    """
+
    type: Literal[EventType.STRUCTURED_LOG] = EventType.STRUCTURED_LOG
    payload: StructuredLogPayload

@ -164,6 +254,14 @@ register_schema(Event, name="Event")

@json_schema_type
 class EvalTrace(BaseModel):
+    """A trace record for evaluation purposes.
+    :param session_id: Unique identifier for the evaluation session
+    :param step: The evaluation step or phase identifier
+    :param input: The input data for the evaluation
+    :param output: The actual output produced during evaluation
+    :param expected_output: The expected output for comparison during evaluation
+    """
+
    session_id: str
    step: str
    input: str
@ -173,11 +271,22 @@ class EvalTrace(BaseModel):

@json_schema_type
 class SpanWithStatus(Span):
+    """A span that includes status information.
+    :param status: (Optional) The current status of the span
+    """
+
    status: SpanStatus | None = None


@json_schema_type
 class QueryConditionOp(Enum):
+    """Comparison operators for query conditions.
+    :cvar EQ: Equal to comparison
+    :cvar NE: Not equal to comparison
+    :cvar GT: Greater than comparison
+    :cvar LT: Less than comparison
+    """
+
    EQ = "eq"
    NE = "ne"
    GT = "gt"
@ -186,29 +295,59 @@ class QueryConditionOp(Enum):

@json_schema_type
 class QueryCondition(BaseModel):
+    """A condition for filtering query results.
+    :param key: The attribute key to filter on
+    :param op: The comparison operator to apply
+    :param value: The value to compare against
+    """
+
    key: str
    op: QueryConditionOp
    value: Any


 class QueryTracesResponse(BaseModel):
+    """Response containing a list of traces.
+    :param data: List of traces matching the query criteria
+    """
+
    data: list[Trace]


 class QuerySpansResponse(BaseModel):
+    """Response containing a list of spans.
+    :param data: List of spans matching the query criteria
+    """
+
    data: list[Span]


 class QuerySpanTreeResponse(BaseModel):
+    """Response containing a tree structure of spans.
+    :param data: Dictionary mapping span IDs to spans with status information
+    """
+
    data: dict[str, SpanWithStatus]


 class MetricQueryType(Enum):
+    """The type of metric query to perform.
+    :cvar RANGE: Query metrics over a time range
+    :cvar INSTANT: Query metrics at a specific point in time
+    """
+
    RANGE = "range"
    INSTANT = "instant"


 class MetricLabelOperator(Enum):
+    """Operators for matching metric labels.
+    :cvar EQUALS: Label value must equal the specified value
+    :cvar NOT_EQUALS: Label value must not equal the specified value
+    :cvar REGEX_MATCH: Label value must match the specified regular expression
+    :cvar REGEX_NOT_MATCH: Label value must not match the specified regular expression
+    """
+
    EQUALS = "="
    NOT_EQUALS = "!="
    REGEX_MATCH = "=~"
@ -216,6 +355,12 @@ class MetricLabelOperator(Enum):


 class MetricLabelMatcher(BaseModel):
+    """A matcher for filtering metrics by label values.
+    :param name: The name of the label to match
+    :param value: The value to match against
+    :param operator: The comparison operator to use for matching
+    """
+
    name: str
    value: str
    operator: MetricLabelOperator = MetricLabelOperator.EQUALS
@ -223,24 +368,44 @@ class MetricLabelMatcher(BaseModel):

@json_schema_type
 class MetricLabel(BaseModel):
+    """A label associated with a metric.
+    :param name: The name of the label
+    :param value: The value of the label
+    """
+
    name: str
    value: str


@json_schema_type
 class MetricDataPoint(BaseModel):
+    """A single data point in a metric time series.
+    :param timestamp: Unix timestamp when the metric value was recorded
+    :param value: The numeric value of the metric at this timestamp
+    """
+
    timestamp: int
    value: float


@json_schema_type
 class MetricSeries(BaseModel):
+    """A time series of metric data points.
+    :param metric: The name of the metric
+    :param labels: List of labels associated with this metric series
+    :param values: List of data points in chronological order
+    """
+
    metric: str
    labels: list[MetricLabel]
    values: list[MetricDataPoint]


 class QueryMetricsResponse(BaseModel):
+    """Response containing metric time series data.
+    :param data: List of metric series matching the query criteria
+    """
+
    data: list[MetricSeries]


@ -259,7 +424,7 @@ class Telemetry(Protocol):
        """
        ...

-    @webmethod(route="/telemetry/traces", method="POST")
+    @webmethod(route="/telemetry/traces", method="POST", required_scope=REQUIRED_SCOPE)
    async def query_traces(
        self,
        attribute_filters: list[QueryCondition] | None = None,
@ -277,7 +442,7 @@ class Telemetry(Protocol):
        """
        ...

-    @webmethod(route="/telemetry/traces/{trace_id:path}", method="GET")
+    @webmethod(route="/telemetry/traces/{trace_id:path}", method="GET", required_scope=REQUIRED_SCOPE)
    async def get_trace(self, trace_id: str) -> Trace:
        """Get a trace by its ID.

@ -286,7 +451,9 @@ class Telemetry(Protocol):
        """
        ...

-    @webmethod(route="/telemetry/traces/{trace_id:path}/spans/{span_id:path}", method="GET")
+    @webmethod(
+        route="/telemetry/traces/{trace_id:path}/spans/{span_id:path}", method="GET", required_scope=REQUIRED_SCOPE
+    )
    async def get_span(self, trace_id: str, span_id: str) -> Span:
        """Get a span by its ID.

@ -296,7 +463,7 @@ class Telemetry(Protocol):
        """
        ...

-    @webmethod(route="/telemetry/spans/{span_id:path}/tree", method="POST")
+    @webmethod(route="/telemetry/spans/{span_id:path}/tree", method="POST", required_scope=REQUIRED_SCOPE)
    async def get_span_tree(
        self,
        span_id: str,
@ -312,7 +479,7 @@ class Telemetry(Protocol):
        """
        ...

-    @webmethod(route="/telemetry/spans", method="POST")
+    @webmethod(route="/telemetry/spans", method="POST", required_scope=REQUIRED_SCOPE)
    async def query_spans(
        self,
        attribute_filters: list[QueryCondition],
@ -345,7 +512,7 @@ class Telemetry(Protocol):
        """
        ...

-    @webmethod(route="/telemetry/metrics/{metric_name}", method="POST")
+    @webmethod(route="/telemetry/metrics/{metric_name}", method="POST", required_scope=REQUIRED_SCOPE)
    async def query_metrics(
        self,
        metric_name: str,
--- a/llama_stack/apis/tools/rag_tool.py
+++ b/llama_stack/apis/tools/rag_tool.py
@ -4,7 +4,7 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.

-from enum import Enum
+from enum import Enum, StrEnum
 from typing import Annotated, Any, Literal, Protocol

 from pydantic import BaseModel, Field, field_validator
@ -22,7 +22,7 @@ class RRFRanker(BaseModel):

    :param type: The type of ranker, always "rrf"
    :param impact_factor: The impact factor for RRF scoring. Higher values give more weight to higher-ranked results.
-                         Must be greater than 0. Default of 60 is from the original RRF paper (Cormack et al., 2009).
+                         Must be greater than 0
    """

    type: Literal["rrf"] = "rrf"
@ -76,25 +76,65 @@ class RAGDocument(BaseModel):

@json_schema_type
 class RAGQueryResult(BaseModel):
+    """Result of a RAG query containing retrieved content and metadata.
+
+    :param content: (Optional) The retrieved content from the query
+    :param metadata: Additional metadata about the query result
+    """
+
    content: InterleavedContent | None = None
    metadata: dict[str, Any] = Field(default_factory=dict)


@json_schema_type
 class RAGQueryGenerator(Enum):
+    """Types of query generators for RAG systems.
+
+    :cvar default: Default query generator using simple text processing
+    :cvar llm: LLM-based query generator for enhanced query understanding
+    :cvar custom: Custom query generator implementation
+    """
+
    default = "default"
    llm = "llm"
    custom = "custom"


+@json_schema_type
+class RAGSearchMode(StrEnum):
+    """Search modes for RAG query retrieval.
+
+    :cvar VECTOR: Uses vector similarity search for semantic matching
+    :cvar KEYWORD: Uses keyword-based search for exact matching
+    :cvar HYBRID: Combines both vector and keyword search for better results
+    """
+
+    VECTOR = "vector"
+    KEYWORD = "keyword"
+    HYBRID = "hybrid"
+
+
@json_schema_type
 class DefaultRAGQueryGeneratorConfig(BaseModel):
+    """Configuration for the default RAG query generator.
+
+    :param type: Type of query generator, always 'default'
+    :param separator: String separator used to join query terms
+    """
+
    type: Literal["default"] = "default"
    separator: str = " "


@json_schema_type
 class LLMRAGQueryGeneratorConfig(BaseModel):
+    """Configuration for the LLM-based RAG query generator.
+
+    :param type: Type of query generator, always 'llm'
+    :param model: Name of the language model to use for query generation
+    :param template: Template string for formatting the query generation prompt
+    """
+
    type: Literal["llm"] = "llm"
    model: str
    template: str
@ -128,7 +168,7 @@ class RAGQueryConfig(BaseModel):
    max_tokens_in_context: int = 4096
    max_chunks: int = 5
    chunk_template: str = "Result {index}\nContent: {chunk.content}\nMetadata: {metadata}\n"
-    mode: str | None = None
+    mode: RAGSearchMode | None = RAGSearchMode.VECTOR
    ranker: Ranker | None = Field(default=None)  # Only used for hybrid mode

    @field_validator("chunk_template")
@ -152,7 +192,12 @@ class RAGToolRuntime(Protocol):
        vector_db_id: str,
        chunk_size_in_tokens: int = 512,
    ) -> None:
-        """Index documents so they can be used by the RAG system"""
+        """Index documents so they can be used by the RAG system.
+
+        :param documents: List of documents to index in the RAG system
+        :param vector_db_id: ID of the vector database to store the document embeddings
+        :param chunk_size_in_tokens: (Optional) Size in tokens for document chunking during indexing
+        """
        ...

    @webmethod(route="/tool-runtime/rag-tool/query", method="POST")
@ -162,5 +207,11 @@ class RAGToolRuntime(Protocol):
        vector_db_ids: list[str],
        query_config: RAGQueryConfig | None = None,
    ) -> RAGQueryResult:
-        """Query the RAG system for context; typically invoked by the agent"""
+        """Query the RAG system for context; typically invoked by the agent.
+
+        :param content: The query content to search for in the indexed documents
+        :param vector_db_ids: List of vector database IDs to search within
+        :param query_config: (Optional) Configuration parameters for the query operation
+        :returns: RAGQueryResult containing the retrieved content and metadata
+        """
        ...
--- a/llama_stack/apis/tools/tools.py
+++ b/llama_stack/apis/tools/tools.py
@ -20,6 +20,15 @@ from .rag_tool import RAGToolRuntime

@json_schema_type
 class ToolParameter(BaseModel):
+    """Parameter definition for a tool.
+
+    :param name: Name of the parameter
+    :param parameter_type: Type of the parameter (e.g., string, integer)
+    :param description: Human-readable description of what the parameter does
+    :param required: Whether this parameter is required for tool invocation
+    :param default: (Optional) Default value for the parameter if not provided
+    """
+
    name: str
    parameter_type: str
    description: str
@ -29,6 +38,15 @@ class ToolParameter(BaseModel):

@json_schema_type
 class Tool(Resource):
+    """A tool that can be invoked by agents.
+
+    :param type: Type of resource, always 'tool'
+    :param toolgroup_id: ID of the tool group this tool belongs to
+    :param description: Human-readable description of what the tool does
+    :param parameters: List of parameters this tool accepts
+    :param metadata: (Optional) Additional metadata about the tool
+    """
+
    type: Literal[ResourceType.tool] = ResourceType.tool
    toolgroup_id: str
    description: str
@ -38,6 +56,14 @@ class Tool(Resource):

@json_schema_type
 class ToolDef(BaseModel):
+    """Tool definition used in runtime contexts.
+
+    :param name: Name of the tool
+    :param description: (Optional) Human-readable description of what the tool does
+    :param parameters: (Optional) List of parameters this tool accepts
+    :param metadata: (Optional) Additional metadata about the tool
+    """
+
    name: str
    description: str | None = None
    parameters: list[ToolParameter] | None = None
@ -46,6 +72,14 @@ class ToolDef(BaseModel):

@json_schema_type
 class ToolGroupInput(BaseModel):
+    """Input data for registering a tool group.
+
+    :param toolgroup_id: Unique identifier for the tool group
+    :param provider_id: ID of the provider that will handle this tool group
+    :param args: (Optional) Additional arguments to pass to the provider
+    :param mcp_endpoint: (Optional) Model Context Protocol endpoint for remote tools
+    """
+
    toolgroup_id: str
    provider_id: str
    args: dict[str, Any] | None = None
@ -54,6 +88,13 @@ class ToolGroupInput(BaseModel):

@json_schema_type
 class ToolGroup(Resource):
+    """A group of related tools managed together.
+
+    :param type: Type of resource, always 'tool_group'
+    :param mcp_endpoint: (Optional) Model Context Protocol endpoint for remote tools
+    :param args: (Optional) Additional arguments for the tool group
+    """
+
    type: Literal[ResourceType.tool_group] = ResourceType.tool_group
    mcp_endpoint: URL | None = None
    args: dict[str, Any] | None = None
@ -61,6 +102,14 @@ class ToolGroup(Resource):

@json_schema_type
 class ToolInvocationResult(BaseModel):
+    """Result of a tool invocation.
+
+    :param content: (Optional) The output content from the tool execution
+    :param error_message: (Optional) Error message if the tool execution failed
+    :param error_code: (Optional) Numeric error code if the tool execution failed
+    :param metadata: (Optional) Additional metadata about the tool execution
+    """
+
    content: InterleavedContent | None = None
    error_message: str | None = None
    error_code: int | None = None
@ -73,14 +122,29 @@ class ToolStore(Protocol):


 class ListToolGroupsResponse(BaseModel):
+    """Response containing a list of tool groups.
+
+    :param data: List of tool groups
+    """
+
    data: list[ToolGroup]


 class ListToolsResponse(BaseModel):
+    """Response containing a list of tools.
+
+    :param data: List of tools
+    """
+
    data: list[Tool]


 class ListToolDefsResponse(BaseModel):
+    """Response containing a list of tool definitions.
+
+    :param data: List of tool definitions
+    """
+
    data: list[ToolDef]


@ -158,6 +222,11 @@ class ToolGroups(Protocol):


 class SpecialToolGroup(Enum):
+    """Special tool groups with predefined functionality.
+
+    :cvar rag_tool: Retrieval-Augmented Generation tool group for document search and retrieval
+    """
+
    rag_tool = "rag_tool"


--- a/llama_stack/apis/vector_dbs/vector_dbs.py
+++ b/llama_stack/apis/vector_dbs/vector_dbs.py
@ -15,10 +15,18 @@ from llama_stack.schema_utils import json_schema_type, webmethod

@json_schema_type
 class VectorDB(Resource):
+    """Vector database resource for storing and querying vector embeddings.
+
+    :param type: Type of resource, always 'vector_db' for vector databases
+    :param embedding_model: Name of the embedding model to use for vector generation
+    :param embedding_dimension: Dimension of the embedding vectors
+    """
+
    type: Literal[ResourceType.vector_db] = ResourceType.vector_db

    embedding_model: str
    embedding_dimension: int
+    vector_db_name: str | None = None

    @property
    def vector_db_id(self) -> str:
@ -30,13 +38,27 @@ class VectorDB(Resource):


 class VectorDBInput(BaseModel):
+    """Input parameters for creating or configuring a vector database.
+
+    :param vector_db_id: Unique identifier for the vector database
+    :param embedding_model: Name of the embedding model to use for vector generation
+    :param embedding_dimension: Dimension of the embedding vectors
+    :param provider_vector_db_id: (Optional) Provider-specific identifier for the vector database
+    """
+
    vector_db_id: str
    embedding_model: str
    embedding_dimension: int
+    provider_id: str | None = None
    provider_vector_db_id: str | None = None


 class ListVectorDBsResponse(BaseModel):
+    """Response from listing vector databases.
+
+    :param data: List of vector databases
+    """
+
    data: list[VectorDB]


@ -70,6 +92,7 @@ class VectorDBs(Protocol):
        embedding_model: str,
        embedding_dimension: int | None = 384,
        provider_id: str | None = None,
+        vector_db_name: str | None = None,
        provider_vector_db_id: str | None = None,
    ) -> VectorDB:
        """Register a vector database.
@ -78,6 +101,7 @@ class VectorDBs(Protocol):
        :param embedding_model: The embedding model to use.
        :param embedding_dimension: The dimension of the embedding model.
        :param provider_id: The identifier of the provider.
+        :param vector_db_name: The name of the vector database.
        :param provider_vector_db_id: The identifier of the vector database in the provider.
        :returns: A VectorDB.
        """
--- a/llama_stack/apis/vector_io/vector_io.py
+++ b/llama_stack/apis/vector_io/vector_io.py
@ -16,7 +16,7 @@ from pydantic import BaseModel, Field
 from llama_stack.apis.inference import InterleavedContent
 from llama_stack.apis.vector_dbs import VectorDB
 from llama_stack.providers.utils.telemetry.trace_protocol import trace_protocol
-from llama_stack.providers.utils.vector_io.chunk_utils import generate_chunk_id
+from llama_stack.providers.utils.vector_io.vector_utils import generate_chunk_id
 from llama_stack.schema_utils import json_schema_type, webmethod
 from llama_stack.strong_typing.schema import register_schema

@ -94,12 +94,27 @@ class Chunk(BaseModel):

@json_schema_type
 class QueryChunksResponse(BaseModel):
+    """Response from querying chunks in a vector database.
+
+    :param chunks: List of content chunks returned from the query
+    :param scores: Relevance scores corresponding to each returned chunk
+    """
+
    chunks: list[Chunk]
    scores: list[float]


@json_schema_type
 class VectorStoreFileCounts(BaseModel):
+    """File processing status counts for a vector store.
+
+    :param completed: Number of files that have been successfully processed
+    :param cancelled: Number of files that had their processing cancelled
+    :param failed: Number of files that failed to process
+    :param in_progress: Number of files currently being processed
+    :param total: Total number of files in the vector store
+    """
+
    completed: int
    cancelled: int
    failed: int
@ -109,7 +124,20 @@ class VectorStoreFileCounts(BaseModel):

@json_schema_type
 class VectorStoreObject(BaseModel):
-    """OpenAI Vector Store object."""
+    """OpenAI Vector Store object.
+
+    :param id: Unique identifier for the vector store
+    :param object: Object type identifier, always "vector_store"
+    :param created_at: Timestamp when the vector store was created
+    :param name: (Optional) Name of the vector store
+    :param usage_bytes: Storage space used by the vector store in bytes
+    :param file_counts: File processing status counts for the vector store
+    :param status: Current status of the vector store
+    :param expires_after: (Optional) Expiration policy for the vector store
+    :param expires_at: (Optional) Timestamp when the vector store will expire
+    :param last_active_at: (Optional) Timestamp of last activity on the vector store
+    :param metadata: Set of key-value pairs that can be attached to the vector store
+    """

    id: str
    object: str = "vector_store"
@ -126,7 +154,14 @@ class VectorStoreObject(BaseModel):

@json_schema_type
 class VectorStoreCreateRequest(BaseModel):
-    """Request to create a vector store."""
+    """Request to create a vector store.
+
+    :param name: (Optional) Name for the vector store
+    :param file_ids: List of file IDs to include in the vector store
+    :param expires_after: (Optional) Expiration policy for the vector store
+    :param chunking_strategy: (Optional) Strategy for splitting files into chunks
+    :param metadata: Set of key-value pairs that can be attached to the vector store
+    """

    name: str | None = None
    file_ids: list[str] = Field(default_factory=list)
@ -137,7 +172,12 @@ class VectorStoreCreateRequest(BaseModel):

@json_schema_type
 class VectorStoreModifyRequest(BaseModel):
-    """Request to modify a vector store."""
+    """Request to modify a vector store.
+
+    :param name: (Optional) Updated name for the vector store
+    :param expires_after: (Optional) Updated expiration policy for the vector store
+    :param metadata: (Optional) Updated set of key-value pairs for the vector store
+    """

    name: str | None = None
    expires_after: dict[str, Any] | None = None
@ -146,7 +186,14 @@ class VectorStoreModifyRequest(BaseModel):

@json_schema_type
 class VectorStoreListResponse(BaseModel):
-    """Response from listing vector stores."""
+    """Response from listing vector stores.
+
+    :param object: Object type identifier, always "list"
+    :param data: List of vector store objects
+    :param first_id: (Optional) ID of the first vector store in the list for pagination
+    :param last_id: (Optional) ID of the last vector store in the list for pagination
+    :param has_more: Whether there are more vector stores available beyond this page
+    """

    object: str = "list"
    data: list[VectorStoreObject]
@ -157,7 +204,14 @@ class VectorStoreListResponse(BaseModel):

@json_schema_type
 class VectorStoreSearchRequest(BaseModel):
-    """Request to search a vector store."""
+    """Request to search a vector store.
+
+    :param query: Search query as a string or list of strings
+    :param filters: (Optional) Filters based on file attributes to narrow search results
+    :param max_num_results: Maximum number of results to return, defaults to 10
+    :param ranking_options: (Optional) Options for ranking and filtering search results
+    :param rewrite_query: Whether to rewrite the query for better vector search performance
+    """

    query: str | list[str]
    filters: dict[str, Any] | None = None
@ -168,13 +222,26 @@ class VectorStoreSearchRequest(BaseModel):

@json_schema_type
 class VectorStoreContent(BaseModel):
+    """Content item from a vector store file or search result.
+
+    :param type: Content type, currently only "text" is supported
+    :param text: The actual text content
+    """
+
    type: Literal["text"]
    text: str


@json_schema_type
 class VectorStoreSearchResponse(BaseModel):
-    """Response from searching a vector store."""
+    """Response from searching a vector store.
+
+    :param file_id: Unique identifier of the file containing the result
+    :param filename: Name of the file containing the result
+    :param score: Relevance score for this search result
+    :param attributes: (Optional) Key-value attributes associated with the file
+    :param content: List of content items matching the search query
+    """

    file_id: str
    filename: str
@ -185,7 +252,14 @@ class VectorStoreSearchResponse(BaseModel):

@json_schema_type
 class VectorStoreSearchResponsePage(BaseModel):
-    """Response from searching a vector store."""
+    """Paginated response from searching a vector store.
+
+    :param object: Object type identifier for the search results page
+    :param search_query: The original search query that was executed
+    :param data: List of search result objects
+    :param has_more: Whether there are more results available beyond this page
+    :param next_page: (Optional) Token for retrieving the next page of results
+    """

    object: str = "vector_store.search_results.page"
    search_query: str
@ -196,7 +270,12 @@ class VectorStoreSearchResponsePage(BaseModel):

@json_schema_type
 class VectorStoreDeleteResponse(BaseModel):
-    """Response from deleting a vector store."""
+    """Response from deleting a vector store.
+
+    :param id: Unique identifier of the deleted vector store
+    :param object: Object type identifier for the deletion response
+    :param deleted: Whether the deletion operation was successful
+    """

    id: str
    object: str = "vector_store.deleted"
@ -205,17 +284,34 @@ class VectorStoreDeleteResponse(BaseModel):

@json_schema_type
 class VectorStoreChunkingStrategyAuto(BaseModel):
+    """Automatic chunking strategy for vector store files.
+
+    :param type: Strategy type, always "auto" for automatic chunking
+    """
+
    type: Literal["auto"] = "auto"


@json_schema_type
 class VectorStoreChunkingStrategyStaticConfig(BaseModel):
+    """Configuration for static chunking strategy.
+
+    :param chunk_overlap_tokens: Number of tokens to overlap between adjacent chunks
+    :param max_chunk_size_tokens: Maximum number of tokens per chunk, must be between 100 and 4096
+    """
+
    chunk_overlap_tokens: int = 400
    max_chunk_size_tokens: int = Field(800, ge=100, le=4096)


@json_schema_type
 class VectorStoreChunkingStrategyStatic(BaseModel):
+    """Static chunking strategy with configurable parameters.
+
+    :param type: Strategy type, always "static" for static chunking
+    :param static: Configuration parameters for the static chunking strategy
+    """
+
    type: Literal["static"] = "static"
    static: VectorStoreChunkingStrategyStaticConfig

@ -227,6 +323,12 @@ register_schema(VectorStoreChunkingStrategy, name="VectorStoreChunkingStrategy")


 class SearchRankingOptions(BaseModel):
+    """Options for ranking and filtering search results.
+
+    :param ranker: (Optional) Name of the ranking algorithm to use
+    :param score_threshold: (Optional) Minimum relevance score threshold for results
+    """
+
    ranker: str | None = None
    # NOTE: OpenAI File Search Tool requires threshold to be between 0 and 1, however
    # we don't guarantee that the score is between 0 and 1, so will leave this unconstrained
@ -236,6 +338,12 @@ class SearchRankingOptions(BaseModel):

@json_schema_type
 class VectorStoreFileLastError(BaseModel):
+    """Error information for failed vector store file processing.
+
+    :param code: Error code indicating the type of failure
+    :param message: Human-readable error message describing the failure
+    """
+
    code: Literal["server_error"] | Literal["rate_limit_exceeded"]
    message: str

@ -246,7 +354,18 @@ register_schema(VectorStoreFileStatus, name="VectorStoreFileStatus")

@json_schema_type
 class VectorStoreFileObject(BaseModel):
-    """OpenAI Vector Store File object."""
+    """OpenAI Vector Store File object.
+
+    :param id: Unique identifier for the file
+    :param object: Object type identifier, always "vector_store.file"
+    :param attributes: Key-value attributes associated with the file
+    :param chunking_strategy: Strategy used for splitting the file into chunks
+    :param created_at: Timestamp when the file was added to the vector store
+    :param last_error: (Optional) Error information if file processing failed
+    :param status: Current processing status of the file
+    :param usage_bytes: Storage space used by this file in bytes
+    :param vector_store_id: ID of the vector store containing this file
+    """

    id: str
    object: str = "vector_store.file"
@ -261,7 +380,14 @@ class VectorStoreFileObject(BaseModel):

@json_schema_type
 class VectorStoreListFilesResponse(BaseModel):
-    """Response from listing vector stores."""
+    """Response from listing files in a vector store.
+
+    :param object: Object type identifier, always "list"
+    :param data: List of vector store file objects
+    :param first_id: (Optional) ID of the first file in the list for pagination
+    :param last_id: (Optional) ID of the last file in the list for pagination
+    :param has_more: Whether there are more files available beyond this page
+    """

    object: str = "list"
    data: list[VectorStoreFileObject]
@ -272,7 +398,13 @@ class VectorStoreListFilesResponse(BaseModel):

@json_schema_type
 class VectorStoreFileContentsResponse(BaseModel):
-    """Response from retrieving the contents of a vector store file."""
+    """Response from retrieving the contents of a vector store file.
+
+    :param file_id: Unique identifier for the file
+    :param filename: Name of the file
+    :param attributes: Key-value attributes associated with the file
+    :param content: List of content items from the file
+    """

    file_id: str
    filename: str
@ -282,7 +414,12 @@ class VectorStoreFileContentsResponse(BaseModel):

@json_schema_type
 class VectorStoreFileDeleteResponse(BaseModel):
-    """Response from deleting a vector store file."""
+    """Response from deleting a vector store file.
+
+    :param id: Unique identifier of the deleted file
+    :param object: Object type identifier for the deletion response
+    :param deleted: Whether the deletion operation was successful
+    """

    id: str
    object: str = "vector_store.file.deleted"
@ -338,7 +475,7 @@ class VectorIO(Protocol):
    @webmethod(route="/openai/v1/vector_stores", method="POST")
    async def openai_create_vector_store(
        self,
-        name: str,
+        name: str | None = None,
        file_ids: list[str] | None = None,
        expires_after: dict[str, Any] | None = None,
        chunking_strategy: dict[str, Any] | None = None,
@ -346,7 +483,6 @@ class VectorIO(Protocol):
        embedding_model: str | None = None,
        embedding_dimension: int | None = 384,
        provider_id: str | None = None,
-        provider_vector_db_id: str | None = None,
    ) -> VectorStoreObject:
        """Creates a vector store.

@ -358,7 +494,6 @@ class VectorIO(Protocol):
        :param embedding_model: The embedding model to use for this vector store.
        :param embedding_dimension: The dimension of the embedding vectors (default: 384).
        :param provider_id: The ID of the provider to use for this vector store.
-        :param provider_vector_db_id: The provider-specific vector database ID.
        :returns: A VectorStoreObject representing the created vector store.
        """
        ...
@ -480,6 +615,11 @@ class VectorIO(Protocol):
        """List files in a vector store.

        :param vector_store_id: The ID of the vector store to list files from.
+        :param limit: (Optional) A limit on the number of objects to be returned. Limit can range between 1 and 100, and the default is 20.
+        :param order: (Optional) Sort order by the `created_at` timestamp of the objects. `asc` for ascending order and `desc` for descending order.
+        :param after: (Optional) A cursor for use in pagination. `after` is an object ID that defines your place in the list.
+        :param before: (Optional) A cursor for use in pagination. `before` is an object ID that defines your place in the list.
+        :param filter: (Optional) Filter by file status to only return files with the specified status.
        :returns: A VectorStoreListFilesResponse containing the list of files.
        """
        ...
--- a/llama_stack/cli/download.py
+++ b/llama_stack/cli/download.py
@ -323,7 +323,7 @@ def _hf_download(
    from huggingface_hub import snapshot_download
    from huggingface_hub.utils import GatedRepoError, RepositoryNotFoundError

-    from llama_stack.distribution.utils.model_utils import model_local_dir
+    from llama_stack.core.utils.model_utils import model_local_dir

    repo_id = model.huggingface_repo
    if repo_id is None:
@ -361,7 +361,7 @@ def _meta_download(
    info: "LlamaDownloadInfo",
    max_concurrent_downloads: int,
 ):
-    from llama_stack.distribution.utils.model_utils import model_local_dir
+    from llama_stack.core.utils.model_utils import model_local_dir

    output_dir = Path(model_local_dir(model.descriptor()))
    os.makedirs(output_dir, exist_ok=True)
@ -403,7 +403,7 @@ class Manifest(BaseModel):


 def _download_from_manifest(manifest_file: str, max_concurrent_downloads: int):
-    from llama_stack.distribution.utils.model_utils import model_local_dir
+    from llama_stack.core.utils.model_utils import model_local_dir

    with open(manifest_file) as f:
        d = json.load(f)
--- a/llama_stack/cli/model/list.py
+++ b/llama_stack/cli/model/list.py
@ -11,7 +11,7 @@ from pathlib import Path

 from llama_stack.cli.subcommand import Subcommand
 from llama_stack.cli.table import print_table
-from llama_stack.distribution.utils.config_dirs import DEFAULT_CHECKPOINT_DIR
+from llama_stack.core.utils.config_dirs import DEFAULT_CHECKPOINT_DIR
 from llama_stack.models.llama.sku_list import all_registered_models


--- a/llama_stack/cli/model/remove.py
+++ b/llama_stack/cli/model/remove.py
@ -9,7 +9,7 @@ import os
 import shutil

 from llama_stack.cli.subcommand import Subcommand
-from llama_stack.distribution.utils.config_dirs import DEFAULT_CHECKPOINT_DIR
+from llama_stack.core.utils.config_dirs import DEFAULT_CHECKPOINT_DIR
 from llama_stack.models.llama.sku_list import resolve_model


--- a/llama_stack/cli/stack/_build.py
+++ b/llama_stack/cli/stack/_build.py
@ -23,77 +23,86 @@ from termcolor import colored, cprint

 from llama_stack.cli.stack.utils import ImageType
 from llama_stack.cli.table import print_table
-from llama_stack.distribution.build import (
+from llama_stack.core.build import (
    SERVER_DEPENDENCIES,
    build_image,
    get_provider_dependencies,
 )
-from llama_stack.distribution.configure import parse_and_maybe_upgrade_config
-from llama_stack.distribution.datatypes import (
+from llama_stack.core.configure import parse_and_maybe_upgrade_config
+from llama_stack.core.datatypes import (
    BuildConfig,
+    BuildProvider,
    DistributionSpec,
    Provider,
    StackRunConfig,
 )
-from llama_stack.distribution.distribution import get_provider_registry
-from llama_stack.distribution.resolver import InvalidProviderError
-from llama_stack.distribution.stack import replace_env_vars
-from llama_stack.distribution.utils.config_dirs import DISTRIBS_BASE_DIR, EXTERNAL_PROVIDERS_DIR
-from llama_stack.distribution.utils.dynamic import instantiate_class_type
-from llama_stack.distribution.utils.exec import formulate_run_args, run_command
-from llama_stack.distribution.utils.image_types import LlamaStackImageType
+from llama_stack.core.distribution import get_provider_registry
+from llama_stack.core.external import load_external_apis
+from llama_stack.core.resolver import InvalidProviderError
+from llama_stack.core.stack import replace_env_vars
+from llama_stack.core.utils.config_dirs import DISTRIBS_BASE_DIR, EXTERNAL_PROVIDERS_DIR
+from llama_stack.core.utils.dynamic import instantiate_class_type
+from llama_stack.core.utils.exec import formulate_run_args, run_command
+from llama_stack.core.utils.image_types import LlamaStackImageType
 from llama_stack.providers.datatypes import Api

-TEMPLATES_PATH = Path(__file__).parent.parent.parent / "templates"
+DISTRIBS_PATH = Path(__file__).parent.parent.parent / "distributions"


@lru_cache
-def available_templates_specs() -> dict[str, BuildConfig]:
+def available_distros_specs() -> dict[str, BuildConfig]:
    import yaml

-    template_specs = {}
-    for p in TEMPLATES_PATH.rglob("*build.yaml"):
-        template_name = p.parent.name
+    distro_specs = {}
+    for p in DISTRIBS_PATH.rglob("*build.yaml"):
+        distro_name = p.parent.name
        with open(p) as f:
            build_config = BuildConfig(**yaml.safe_load(f))
-            template_specs[template_name] = build_config
-    return template_specs
+            distro_specs[distro_name] = build_config
+    return distro_specs


 def run_stack_build_command(args: argparse.Namespace) -> None:
-    if args.list_templates:
-        return _run_template_list_cmd()
+    if args.list_distros:
+        return _run_distro_list_cmd()

    if args.image_type == ImageType.VENV.value:
        current_venv = os.environ.get("VIRTUAL_ENV")
        image_name = args.image_name or current_venv
-    elif args.image_type == ImageType.CONDA.value:
-        current_conda_env = os.environ.get("CONDA_DEFAULT_ENV")
-        image_name = args.image_name or current_conda_env
    else:
        image_name = args.image_name

    if args.template:
-        available_templates = available_templates_specs()
-        if args.template not in available_templates:
+        cprint(
+            "The --template argument is deprecated. Please use --distro instead.",
+            color="red",
+            file=sys.stderr,
+        )
+        distro_name = args.template
+    else:
+        distro_name = args.distribution
+
+    if distro_name:
+        available_distros = available_distros_specs()
+        if distro_name not in available_distros:
            cprint(
-                f"Could not find template {args.template}. Please run `llama stack build --list-templates` to check out the available templates",
+                f"Could not find distribution {distro_name}. Please run `llama stack build --list-distros` to check out the available distributions",
                color="red",
                file=sys.stderr,
            )
            sys.exit(1)
-        build_config = available_templates[args.template]
+        build_config = available_distros[distro_name]
        if args.image_type:
            build_config.image_type = args.image_type
        else:
            cprint(
-                f"Please specify a image-type ({' | '.join(e.value for e in ImageType)}) for {args.template}",
+                f"Please specify a image-type ({' | '.join(e.value for e in ImageType)}) for {distro_name}",
                color="red",
                file=sys.stderr,
            )
            sys.exit(1)
    elif args.providers:
-        providers_list: dict[str, str | list[str]] = dict()
+        provider_list: dict[str, list[BuildProvider]] = dict()
        for api_provider in args.providers.split(","):
            if "=" not in api_provider:
                cprint(
@ -102,7 +111,7 @@ def run_stack_build_command(args: argparse.Namespace) -> None:
                    file=sys.stderr,
                )
                sys.exit(1)
-            api, provider = api_provider.split("=")
+            api, provider_type = api_provider.split("=")
            providers_for_api = get_provider_registry().get(Api(api), None)
            if providers_for_api is None:
                cprint(
@ -111,16 +120,12 @@ def run_stack_build_command(args: argparse.Namespace) -> None:
                    file=sys.stderr,
                )
                sys.exit(1)
-            if provider in providers_for_api:
-                if api not in providers_list:
-                    providers_list[api] = []
-                # Use type guarding to ensure we have a list
-                provider_value = providers_list[api]
-                if isinstance(provider_value, list):
-                    provider_value.append(provider)
-                else:
-                    # Convert string to list and append
-                    providers_list[api] = [provider_value, provider]
+            if provider_type in providers_for_api:
+                provider = BuildProvider(
+                    provider_type=provider_type,
+                    module=None,
+                )
+                provider_list.setdefault(api, []).append(provider)
            else:
                cprint(
-                    f"{provider} is not a valid provider for the {api} API.",
+                    f"{provider_type} is not a valid provider for the {api} API.",
@ -129,19 +134,19 @@ def run_stack_build_command(args: argparse.Namespace) -> None:
                )
                sys.exit(1)
        distribution_spec = DistributionSpec(
-            providers=providers_list,
-            description=",".join(args.providers),
+            providers=provider_list,
+            description=args.providers,
        )
        if not args.image_type:
            cprint(
-                f"Please specify a image-type (container | conda | venv) for {args.template}",
+                f"Please specify a image-type ({' | '.join(e.value for e in ImageType)}) for {args.providers}",
                color="red",
                file=sys.stderr,
            )
            sys.exit(1)

        build_config = BuildConfig(image_type=args.image_type, distribution_spec=distribution_spec)
-    elif not args.config and not args.template:
+    elif not args.config and not distro_name:
        name = prompt(
            "> Enter a name for your Llama Stack (e.g. my-local-stack): ",
            validator=Validator.from_callable(
@ -160,22 +165,7 @@ def run_stack_build_command(args: argparse.Namespace) -> None:
            ),
        )

-        if image_type == ImageType.CONDA.value:
-            if not image_name:
-                cprint(
-                    f"No current conda environment detected or specified, will create a new conda environment with the name `llamastack-{name}`",
-                    color="yellow",
-                    file=sys.stderr,
-                )
-                image_name = f"llamastack-{name}"
-            else:
-                cprint(
-                    f"Using conda environment {image_name}",
-                    color="green",
-                    file=sys.stderr,
-                )
-        else:
-            image_name = f"llamastack-{name}"
+        image_name = f"llamastack-{name}"

        cprint(
            textwrap.dedent(
@ -190,7 +180,7 @@ def run_stack_build_command(args: argparse.Namespace) -> None:

        cprint("Tip: use <TAB> to see options for the providers.\n", color="green", file=sys.stderr)

-        providers: dict[str, str | list[str]] = dict()
+        providers: dict[str, list[BuildProvider]] = dict()
        for api, providers_for_api in get_provider_registry().items():
            available_providers = [x for x in providers_for_api.keys() if x not in ("remote", "remote::sample")]
            if not available_providers:
@ -205,7 +195,10 @@ def run_stack_build_command(args: argparse.Namespace) -> None:
                ),
            )

-            providers[api.value] = api_provider
+            string_providers = api_provider.split(" ")
+
+            for provider in string_providers:
+                providers.setdefault(api.value, []).append(BuildProvider(provider_type=provider))

        description = prompt(
            "\n > (Optional) Enter a short description for your Llama Stack: ",
@ -235,12 +228,14 @@ def run_stack_build_command(args: argparse.Namespace) -> None:
                sys.exit(1)

    if args.print_deps_only:
-        print(f"# Dependencies for {args.template or args.config or image_name}")
-        normal_deps, special_deps = get_provider_dependencies(build_config)
+        print(f"# Dependencies for {distro_name or args.config or image_name}")
+        normal_deps, special_deps, external_provider_dependencies = get_provider_dependencies(build_config)
        normal_deps += SERVER_DEPENDENCIES
        print(f"uv pip install {' '.join(normal_deps)}")
        for special_dep in special_deps:
            print(f"uv pip install {special_dep}")
+        for external_dep in external_provider_dependencies:
+            print(f"uv pip install {external_dep}")
        return

    try:
@ -248,7 +243,7 @@ def run_stack_build_command(args: argparse.Namespace) -> None:
            build_config,
            image_name=image_name,
            config_path=args.config,
-            template_name=args.template,
+            distro_name=distro_name,
        )

    except (Exception, RuntimeError) as exc:
@ -276,8 +271,8 @@ def run_stack_build_command(args: argparse.Namespace) -> None:
        config = parse_and_maybe_upgrade_config(config_dict)
        if config.external_providers_dir and not config.external_providers_dir.exists():
            config.external_providers_dir.mkdir(exist_ok=True)
-        run_args = formulate_run_args(args.image_type, args.image_name, config, args.template)
-        run_args.extend([str(os.getenv("LLAMA_STACK_PORT", 8321)), "--config", run_config])
+        run_args = formulate_run_args(args.image_type, image_name or config.image_name)
+        run_args.extend([str(os.getenv("LLAMA_STACK_PORT", 8321)), "--config", str(run_config)])
        run_command(run_args)


@ -303,27 +298,25 @@ def _generate_run_config(
    provider_registry = get_provider_registry(build_config)
    for api in apis:
        run_config.providers[api] = []
-        provider_types = build_config.distribution_spec.providers[api]
-        if isinstance(provider_types, str):
-            provider_types = [provider_types]
+        providers = build_config.distribution_spec.providers[api]

-        for i, provider_type in enumerate(provider_types):
-            pid = provider_type.split("::")[-1]
+        for provider in providers:
+            pid = provider.provider_type.split("::")[-1]

-            p = provider_registry[Api(api)][provider_type]
+            p = provider_registry[Api(api)][provider.provider_type]
            if p.deprecation_error:
                raise InvalidProviderError(p.deprecation_error)

            try:
-                config_type = instantiate_class_type(provider_registry[Api(api)][provider_type].config_class)
-            except ModuleNotFoundError:
+                config_type = instantiate_class_type(provider_registry[Api(api)][provider.provider_type].config_class)
+            except (ModuleNotFoundError, ValueError) as exc:
                # HACK ALERT:
                # This code executes after building is done, the import cannot work since the
                # package is either available in the venv or container - not available on the host.
                # TODO: use a "is_external" flag in ProviderSpec to check if the provider is
                # external
                cprint(
-                    f"Failed to import provider {provider_type} for API {api} - assuming it's external, skipping",
+                    f"Failed to import provider {provider.provider_type} for API {api} - assuming it's external, skipping: {exc}",
                    color="yellow",
                    file=sys.stderr,
                )
@ -336,9 +329,10 @@ def _generate_run_config(
                config = {}

            p_spec = Provider(
-                provider_id=f"{pid}-{i}" if len(provider_types) > 1 else pid,
-                provider_type=provider_type,
+                provider_id=pid,
+                provider_type=provider.provider_type,
                config=config,
+                module=provider.module,
            )
            run_config.providers[api].append(p_spec)

@ -360,20 +354,17 @@ def _generate_run_config(
 def _run_stack_build_command_from_build_config(
    build_config: BuildConfig,
    image_name: str | None = None,
-    template_name: str | None = None,
+    distro_name: str | None = None,
    config_path: str | None = None,
 ) -> Path | Traversable:
    image_name = image_name or build_config.image_name
    if build_config.image_type == LlamaStackImageType.CONTAINER.value:
-        if template_name:
-            image_name = f"distribution-{template_name}"
+        if distro_name:
+            image_name = f"distribution-{distro_name}"
        else:
            if not image_name:
                raise ValueError("Please specify an image name when building a container image without a template")
-    elif build_config.image_type == LlamaStackImageType.CONDA.value:
-        if not image_name:
-            raise ValueError("Please specify an image name when building a conda image")
-    elif build_config.image_type == LlamaStackImageType.VENV.value:
+    else:
        if not image_name and os.environ.get("UV_SYSTEM_PYTHON"):
            image_name = "__system__"
        if not image_name:
@ -383,9 +374,9 @@ def _run_stack_build_command_from_build_config(
    if image_name is None:
        raise ValueError("image_name should not be None after validation")

-    if template_name:
-        build_dir = DISTRIBS_BASE_DIR / template_name
-        build_file_path = build_dir / f"{template_name}-build.yaml"
+    if distro_name:
+        build_dir = DISTRIBS_BASE_DIR / distro_name
+        build_file_path = build_dir / f"{distro_name}-build.yaml"
    else:
        if image_name is None:
            raise ValueError("image_name cannot be None")
@ -396,58 +387,79 @@ def _run_stack_build_command_from_build_config(
    run_config_file = None
    # Generate the run.yaml so it can be included in the container image with the proper entrypoint
    # Only do this if we're building a container image and we're not using a template
-    if build_config.image_type == LlamaStackImageType.CONTAINER.value and not template_name and config_path:
+    if build_config.image_type == LlamaStackImageType.CONTAINER.value and not distro_name and config_path:
        cprint("Generating run.yaml file", color="yellow", file=sys.stderr)
        run_config_file = _generate_run_config(build_config, build_dir, image_name)

    with open(build_file_path, "w") as f:
-        to_write = json.loads(build_config.model_dump_json())
+        to_write = json.loads(build_config.model_dump_json(exclude_none=True))
        f.write(yaml.dump(to_write, sort_keys=False))

+    # We first install the external APIs so that the build process can use them and discover the
+    # providers dependencies
+    if build_config.external_apis_dir:
+        cprint("Installing external APIs", color="yellow", file=sys.stderr)
+        external_apis = load_external_apis(build_config)
+        if external_apis:
+            # Collect the pip packages declared by every external API; the install step is skipped below when none are declared
+            packages = []
+            for api_spec in external_apis.values():
+                if api_spec.pip_packages:
+                    packages.extend(api_spec.pip_packages)
+                    cprint(
+                        f"Installing {api_spec.name} with pip packages {api_spec.pip_packages}",
+                        color="yellow",
+                        file=sys.stderr,
+                    )
+            return_code = run_command(["uv", "pip", "install", *packages]) if packages else 0
+            if return_code != 0:
+                packages_str = ", ".join(packages)
+                raise RuntimeError(
+                    f"Failed to install external APIs packages: {packages_str} (return code: {return_code})"
+                )
+
    return_code = build_image(
        build_config,
-        build_file_path,
        image_name,
-        template_or_config=template_name or config_path or str(build_file_path),
+        distro_or_config=distro_name or config_path or str(build_file_path),
        run_config=run_config_file.as_posix() if run_config_file else None,
    )
    if return_code != 0:
        raise RuntimeError(f"Failed to build image {image_name}")

-    if template_name:
-        # copy run.yaml from template to build_dir instead of generating it again
-        template_path = importlib.resources.files("llama_stack") / f"templates/{template_name}/run.yaml"
-        run_config_file = build_dir / f"{template_name}-run.yaml"
+    if distro_name:
+        # copy run.yaml from distribution to build_dir instead of generating it again
+        distro_path = importlib.resources.files("llama_stack") / f"distributions/{distro_name}/run.yaml"
+        run_config_file = build_dir / f"{distro_name}-run.yaml"

-        with importlib.resources.as_file(template_path) as path:
+        with importlib.resources.as_file(distro_path) as path:
            shutil.copy(path, run_config_file)

        cprint("Build Successful!", color="green", file=sys.stderr)
-        cprint(f"You can find the newly-built template here: {run_config_file}", color="blue", file=sys.stderr)
+        cprint(f"You can find the newly-built distribution here: {run_config_file}", color="blue", file=sys.stderr)
        cprint(
            "You can run the new Llama Stack distro via: "
            + colored(f"llama stack run {run_config_file} --image-type {build_config.image_type}", "blue"),
            color="green",
            file=sys.stderr,
        )
-        return template_path
+        return distro_path
    else:
        return _generate_run_config(build_config, build_dir, image_name)


-def _run_template_list_cmd() -> None:
-    # eventually, this should query a registry at llama.meta.com/llamastack/distributions
+def _run_distro_list_cmd() -> None:
    headers = [
-        "Template Name",
+        "Distribution Name",
        # "Providers",
        "Description",
    ]

    rows = []
-    for template_name, spec in available_templates_specs().items():
+    for distro_name, spec in available_distros_specs().items():
        rows.append(
            [
-                template_name,
+                distro_name,
                # json.dumps(spec.distribution_spec.providers, indent=2),
                spec.distribution_spec.description,
            ]
--- a/llama_stack/cli/stack/build.py
+++ b/llama_stack/cli/stack/build.py
@ -27,21 +27,31 @@ class StackBuild(Subcommand):
            "--config",
            type=str,
            default=None,
            help="Path to a config file to use for the build. You can find example configs in llama_stack/distributions/**/build.yaml. If this argument is not provided, you will be prompted to enter information interactively",
        )

        self.parser.add_argument(
            "--template",
            type=str,
            default=None,
-            help="Name of the example template config to use for build. You may use `llama stack build --list-templates` to check out the available templates",
+            help="""(deprecated) Name of the example template config to use for build. You may use `llama stack build --list-distros` to check out the available distributions""",
+        )
+        self.parser.add_argument(
+            "--distro",
+            "--distribution",
+            dest="distribution",
+            type=str,
+            default=None,
+            help="""Name of the distribution to use for build. You may use `llama stack build --list-distros` to check out the available distributions""",
        )

        self.parser.add_argument(
-            "--list-templates",
+            "--list-distros",
+            "--list-distributions",
            action="store_true",
+            dest="list_distros",
            default=False,
-            help="Show the available templates for building a Llama Stack distribution",
+            help="Show the available distributions for building a Llama Stack distribution",
        )

        self.parser.add_argument(
@ -56,7 +66,7 @@ class StackBuild(Subcommand):
            "--image-name",
            type=str,
            help=textwrap.dedent(
-                f"""[for image-type={"|".join(e.value for e in ImageType)}] Name of the conda or virtual environment to use for
+                f"""[for image-type={"|".join(e.value for e in ImageType)}] Name of the virtual environment to use for
 the build. If not specified, currently active environment will be used if found.
            """
            ),
--- a/llama_stack/cli/stack/list_apis.py
+++ b/llama_stack/cli/stack/list_apis.py
@ -26,7 +26,7 @@ class StackListApis(Subcommand):

    def _run_apis_list_cmd(self, args: argparse.Namespace) -> None:
        from llama_stack.cli.table import print_table
-        from llama_stack.distribution.distribution import stack_apis
+        from llama_stack.core.distribution import stack_apis

        # eventually, this should query a registry at llama.meta.com/llamastack/distributions
        headers = [
--- a/llama_stack/cli/stack/list_providers.py
+++ b/llama_stack/cli/stack/list_providers.py
@ -23,7 +23,7 @@ class StackListProviders(Subcommand):

    @property
    def providable_apis(self):
-        from llama_stack.distribution.distribution import providable_apis
+        from llama_stack.core.distribution import providable_apis

        return [api.value for api in providable_apis()]

@ -38,7 +38,7 @@ class StackListProviders(Subcommand):

    def _run_providers_list_cmd(self, args: argparse.Namespace) -> None:
        from llama_stack.cli.table import print_table
-        from llama_stack.distribution.distribution import Api, get_provider_registry
+        from llama_stack.core.distribution import Api, get_provider_registry

        all_providers = get_provider_registry()
        if args.api:
--- a/llama_stack/cli/stack/run.py
+++ b/llama_stack/cli/stack/run.py
@ -35,8 +35,8 @@ class StackRun(Subcommand):
            "config",
            type=str,
            nargs="?",  # Make it optional
-            metavar="config | template",
-            help="Path to config file to use for the run or name of known template (`llama stack list` for a list).",
+            metavar="config | distro",
+            help="Path to config file to use for the run or name of known distro (`llama stack list` for a list).",
        )
        self.parser.add_argument(
            "--port",
@ -47,7 +47,7 @@ class StackRun(Subcommand):
        self.parser.add_argument(
            "--image-name",
            type=str,
-            default=os.environ.get("CONDA_DEFAULT_ENV"),
+            default=None,
            help="Name of the image to run. Defaults to the current environment",
        )
        self.parser.add_argument(
@ -59,7 +59,7 @@ class StackRun(Subcommand):
        self.parser.add_argument(
            "--image-type",
            type=str,
-            help="Image Type used during the build. This can be either conda or container or venv.",
+            help="Image Type used during the build. This can be only venv.",
            choices=[e.value for e in ImageType if e.value != ImageType.CONTAINER.value],
        )
        self.parser.add_argument(
@ -68,37 +68,22 @@ class StackRun(Subcommand):
            help="Start the UI server",
        )

-    # If neither image type nor image name is provided, but at the same time
-    # the current environment has conda breadcrumbs, then assume what the user
-    # wants to use conda mode and not the usual default mode (using
-    # pre-installed system packages).
-    #
-    # Note: yes, this is hacky. It's implemented this way to keep the existing
-    # conda users unaffected by the switch of the default behavior to using
-    # system packages.
-    def _get_image_type_and_name(self, args: argparse.Namespace) -> tuple[str, str]:
-        conda_env = os.environ.get("CONDA_DEFAULT_ENV")
-        if conda_env and args.image_name == conda_env:
-            logger.warning(f"Conda detected. Using conda environment {conda_env} for the run.")
-            return ImageType.CONDA.value, args.image_name
-        return args.image_type, args.image_name
-
-    def _resolve_config_and_template(self, args: argparse.Namespace) -> tuple[Path | None, str | None]:
-        """Resolve config file path and template name from args.config"""
-        from llama_stack.distribution.utils.config_dirs import DISTRIBS_BASE_DIR
+    def _resolve_config_and_distro(self, args: argparse.Namespace) -> tuple[Path | None, str | None]:
+        """Resolve config file path and distribution name from args.config"""
+        from llama_stack.core.utils.config_dirs import DISTRIBS_BASE_DIR

        if not args.config:
            return None, None

        config_file = Path(args.config)
        has_yaml_suffix = args.config.endswith(".yaml")
-        template_name = None
+        distro_name = None

        if not config_file.exists() and not has_yaml_suffix:
-            # check if this is a template
-            config_file = Path(REPO_ROOT) / "llama_stack" / "templates" / args.config / "run.yaml"
+            # check if this is a distribution
+            config_file = Path(REPO_ROOT) / "llama_stack" / "distributions" / args.config / "run.yaml"
            if config_file.exists():
-                template_name = args.config
+                distro_name = args.config

        if not config_file.exists() and not has_yaml_suffix:
            # check if it's a build config saved to ~/.llama dir
@ -114,24 +99,31 @@ class StackRun(Subcommand):
                f"Config file must be a valid file path, '{config_file}' is not a file: type={type(config_file)}"
            )

-        return config_file, template_name
+        return config_file, distro_name

    def _run_stack_run_cmd(self, args: argparse.Namespace) -> None:
        import yaml

-        from llama_stack.distribution.configure import parse_and_maybe_upgrade_config
-        from llama_stack.distribution.utils.exec import formulate_run_args, run_command
+        from llama_stack.core.configure import parse_and_maybe_upgrade_config
+        from llama_stack.core.utils.exec import formulate_run_args, run_command

        if args.enable_ui:
            self._start_ui_development_server(args.port)
-        image_type, image_name = self._get_image_type_and_name(args)
+        image_type, image_name = args.image_type, args.image_name

-        # Resolve config file and template name first
-        config_file, template_name = self._resolve_config_and_template(args)
+        if args.config:
+            try:
+                from llama_stack.core.utils.config_resolution import Mode, resolve_config_or_distro
+
+                config_file = resolve_config_or_distro(args.config, Mode.RUN)
+            except ValueError as e:
+                self.parser.error(str(e))
+        else:
+            config_file = None

        # Check if config is required based on image type
-        if (image_type in [ImageType.CONDA.value, ImageType.VENV.value]) and not config_file:
-            self.parser.error("Config file is required for venv and conda environments")
+        if image_type == ImageType.VENV.value and not config_file:
+            self.parser.error("Config file is required for venv environment")

        if config_file:
            logger.info(f"Using run configuration: {config_file}")
@ -154,7 +146,7 @@ class StackRun(Subcommand):
        # using the current environment packages.
        if not image_type and not image_name:
            logger.info("No image type or image name provided. Assuming environment packages.")
-            from llama_stack.distribution.server.server import main as server_main
+            from llama_stack.core.server.server import main as server_main

            # Build the server args from the current args passed to the CLI
            server_args = argparse.Namespace()
@ -165,18 +157,14 @@ class StackRun(Subcommand):
                if callable(getattr(args, arg)):
                    continue
                if arg == "config":
-                    if template_name:
-                        server_args.template = str(template_name)
-                    else:
-                        # Set the config file path
-                        server_args.config = str(config_file)
+                    server_args.config = str(config_file)
                else:
                    setattr(server_args, arg, getattr(args, arg))

            # Run the server
            server_main(server_args)
        else:
-            run_args = formulate_run_args(image_type, image_name, config, template_name)
+            run_args = formulate_run_args(image_type, image_name)

            run_args.extend([str(args.port)])

--- a/llama_stack/cli/stack/utils.py
+++ b/llama_stack/cli/stack/utils.py
@ -8,7 +8,6 @@ from enum import Enum


 class ImageType(Enum):
-    CONDA = "conda"
    CONTAINER = "container"
    VENV = "venv"

--- a/llama_stack/cli/utils.py
+++ b/llama_stack/cli/utils.py
@ -0,0 +1,29 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+import argparse
+
+from llama_stack.log import get_logger
+
+logger = get_logger(name=__name__, category="cli")
+
+
+# TODO: this can probably just be inlined now?
+def add_config_distro_args(parser: argparse.ArgumentParser):
+    """Register the positional ``config`` argument on ``parser``.
+
+    ``config`` is declared with ``nargs="?"`` inside a required mutually exclusive group.
+    """
+
+    positional_kwargs = {"nargs": "?", "help": "Configuration file path or distribution name"}
+    exclusive_group = parser.add_mutually_exclusive_group(required=True)
+    exclusive_group.add_argument("config", **positional_kwargs)
+
+
+def get_config_from_args(args: argparse.Namespace) -> str | None:
+    """Return ``args.config`` converted to ``str``, or ``None`` when it is unset."""
+    config = args.config
+    return None if config is None else str(config)
--- a/llama_stack/cli/verify_download.py
+++ b/llama_stack/cli/verify_download.py
@ -107,7 +107,7 @@ def verify_files(model_dir: Path, checksums: dict[str, str], console: Console) -


 def run_verify_cmd(args: argparse.Namespace, parser: argparse.ArgumentParser):
-    from llama_stack.distribution.utils.model_utils import model_local_dir
+    from llama_stack.core.utils.model_utils import model_local_dir

    console = Console()
    model_dir = Path(model_local_dir(args.model_id))
--- a/llama_stack/distribution/init.py
+++ b/llama_stack/distribution/init.py
--- a/llama_stack/distribution/access_control/init.py
+++ b/llama_stack/distribution/access_control/init.py
--- a/llama_stack/distribution/access_control/access_control.py
+++ b/llama_stack/distribution/access_control/access_control.py
@ -6,7 +6,7 @@

 from typing import Any

-from llama_stack.distribution.datatypes import User
+from llama_stack.core.datatypes import User

 from .conditions import (
    Condition,
--- a/llama_stack/distribution/access_control/conditions.py
+++ b/llama_stack/distribution/access_control/conditions.py
--- a/llama_stack/distribution/access_control/datatypes.py
+++ b/llama_stack/distribution/access_control/datatypes.py
--- a/llama_stack/distribution/build.py
+++ b/llama_stack/distribution/build.py
@ -7,17 +7,17 @@
 import importlib.resources
 import logging
 import sys
-from pathlib import Path

 from pydantic import BaseModel
 from termcolor import cprint

-from llama_stack.distribution.datatypes import BuildConfig
-from llama_stack.distribution.distribution import get_provider_registry
-from llama_stack.distribution.utils.exec import run_command
-from llama_stack.distribution.utils.image_types import LlamaStackImageType
+from llama_stack.core.datatypes import BuildConfig
+from llama_stack.core.distribution import get_provider_registry
+from llama_stack.core.external import load_external_apis
+from llama_stack.core.utils.exec import run_command
+from llama_stack.core.utils.image_types import LlamaStackImageType
+from llama_stack.distributions.template import DistributionTemplate
 from llama_stack.providers.datatypes import Api
-from llama_stack.templates.template import DistributionTemplate

 log = logging.getLogger(__name__)

@ -41,7 +41,7 @@ class ApiInput(BaseModel):

 def get_provider_dependencies(
    config: BuildConfig | DistributionTemplate,
-) -> tuple[list[str], list[str]]:
+) -> tuple[list[str], list[str], list[str]]:
    """Get normal and special dependencies from provider configuration."""
    if isinstance(config, DistributionTemplate):
        config = config.build_config()
@ -50,6 +50,7 @@ def get_provider_dependencies(
    additional_pip_packages = config.additional_pip_packages

    deps = []
+    external_provider_deps = []
    registry = get_provider_registry(config)
    for api_str, provider_or_providers in providers.items():
        providers_for_api = registry[Api(api_str)]
@ -64,8 +65,16 @@ def get_provider_dependencies(
                raise ValueError(f"Provider `{provider}` is not available for API `{api_str}`")

            provider_spec = providers_for_api[provider_type]
-            deps.extend(provider_spec.pip_packages)
-            if provider_spec.container_image:
+            if hasattr(provider_spec, "is_external") and provider_spec.is_external:
+                # this ensures we install the top level module for our external providers
+                if provider_spec.module:
+                    if isinstance(provider_spec.module, str):
+                        external_provider_deps.append(provider_spec.module)
+                    else:
+                        external_provider_deps.extend(provider_spec.module)
+            if hasattr(provider_spec, "pip_packages"):
+                deps.extend(provider_spec.pip_packages)
+            if hasattr(provider_spec, "container_image") and provider_spec.container_image:
                raise ValueError("A stack's dependencies cannot have a container image")

    normal_deps = []
@ -78,7 +87,7 @@ def get_provider_dependencies(

    normal_deps.extend(additional_pip_packages or [])

-    return list(set(normal_deps)), list(set(special_deps))
+    return list(set(normal_deps)), list(set(special_deps)), list(set(external_provider_deps))


 def print_pip_install_help(config: BuildConfig):
@ -96,48 +105,54 @@ def print_pip_install_help(config: BuildConfig):

 def build_image(
    build_config: BuildConfig,
-    build_file_path: Path,
    image_name: str,
-    template_or_config: str,
+    distro_or_config: str,
    run_config: str | None = None,
 ):
    container_base = build_config.distribution_spec.container_image or "python:3.12-slim"

-    normal_deps, special_deps = get_provider_dependencies(build_config)
+    normal_deps, special_deps, external_provider_deps = get_provider_dependencies(build_config)
    normal_deps += SERVER_DEPENDENCIES
+    if build_config.external_apis_dir:
+        external_apis = load_external_apis(build_config)
+        if external_apis:
+            for _, api_spec in external_apis.items():
+                normal_deps.extend(api_spec.pip_packages)

    if build_config.image_type == LlamaStackImageType.CONTAINER.value:
-        script = str(importlib.resources.files("llama_stack") / "distribution/build_container.sh")
+        script = str(importlib.resources.files("llama_stack") / "core/build_container.sh")
        args = [
            script,
-            template_or_config,
+            "--distro-or-config",
+            distro_or_config,
+            "--image-name",
            image_name,
+            "--container-base",
            container_base,
+            "--normal-deps",
            " ".join(normal_deps),
        ]
-
        # When building from a config file (not a template), include the run config path in the
        # build arguments
        if run_config is not None:
-            args.append(run_config)
-    elif build_config.image_type == LlamaStackImageType.CONDA.value:
-        script = str(importlib.resources.files("llama_stack") / "distribution/build_conda_env.sh")
-        args = [
-            script,
-            str(image_name),
-            str(build_file_path),
-            " ".join(normal_deps),
-        ]
-    elif build_config.image_type == LlamaStackImageType.VENV.value:
-        script = str(importlib.resources.files("llama_stack") / "distribution/build_venv.sh")
+            args.extend(["--run-config", run_config])
+    else:
+        script = str(importlib.resources.files("llama_stack") / "core/build_venv.sh")
        args = [
            script,
+            "--env-name",
            str(image_name),
+            "--normal-deps",
            " ".join(normal_deps),
        ]

+    # Pass the optional dependency flags as named arguments, and only when there is something to install
    if special_deps:
-        args.append("#".join(special_deps))
+        args.extend(["--optional-deps", "#".join(special_deps)])
+    if external_provider_deps:
+        args.extend(
+            ["--external-provider-deps", "#".join(external_provider_deps)]
+        )  # the script will install external provider module, get its deps, and install those too.

    return_code = run_command(args)

--- a/llama_stack/distribution/build_conda_env.sh
+++ b/llama_stack/distribution/build_conda_env.sh
@ -9,10 +9,91 @@
 LLAMA_STACK_DIR=${LLAMA_STACK_DIR:-}
 LLAMA_STACK_CLIENT_DIR=${LLAMA_STACK_CLIENT_DIR:-}
 TEST_PYPI_VERSION=${TEST_PYPI_VERSION:-}
+PYPI_VERSION=${PYPI_VERSION:-}
 # This timeout (in seconds) is necessary when installing PyTorch via uv since it's likely to time out
 # Reference: https://github.com/astral-sh/uv/pull/1694
 UV_HTTP_TIMEOUT=${UV_HTTP_TIMEOUT:-500}

+set -euo pipefail
+
+# Define color codes
+RED='\033[0;31m'
+GREEN='\033[0;32m'
+NC='\033[0m' # No Color
+
+SCRIPT_DIR=$(dirname "$(readlink -f "$0")")
+source "$SCRIPT_DIR/common.sh"
+
+# Usage function
+usage() {
+  echo "Usage: $0 --env-name <conda_env_name> --build-file-path <build_file_path> --normal-deps <pip_dependencies> [--external-provider-deps <external_provider_deps>] [--optional-deps <special_pip_deps>]"
+  echo "Example: $0 --env-name my-conda-env --build-file-path ./my-stack-build.yaml --normal-deps 'numpy pandas scipy' --external-provider-deps 'foo' --optional-deps 'bar'"
+  exit 1
+}
+
+# Parse arguments
+env_name=""
+build_file_path=""
+normal_deps=""
+external_provider_deps=""
+optional_deps=""
+
+while [[ $# -gt 0 ]]; do
+  key="$1"
+  case "$key" in
+    --env-name)
+      if [[ -z "$2" || "$2" == --* ]]; then
+        echo "Error: --env-name requires a string value" >&2
+        usage
+      fi
+      env_name="$2"
+      shift 2
+      ;;
+    --build-file-path)
+      if [[ -z "$2" || "$2" == --* ]]; then
+        echo "Error: --build-file-path requires a string value" >&2
+        usage
+      fi
+      build_file_path="$2"
+      shift 2
+      ;;
+    --normal-deps)
+      if [[ -z "$2" || "$2" == --* ]]; then
+        echo "Error: --normal-deps requires a string value" >&2
+        usage
+      fi
+      normal_deps="$2"
+      shift 2
+      ;;
+    --external-provider-deps)
+      if [[ -z "$2" || "$2" == --* ]]; then
+        echo "Error: --external-provider-deps requires a string value" >&2
+        usage
+      fi
+      external_provider_deps="$2"
+      shift 2
+      ;;
+    --optional-deps)
+      if [[ -z "$2" || "$2" == --* ]]; then
+        echo "Error: --optional-deps requires a string value" >&2
+        usage
+      fi
+      optional_deps="$2"
+      shift 2
+      ;;
+    *)
+      echo "Unknown option: $1" >&2
+      usage
+      ;;
+  esac
+done
+
+# Check required arguments
+if [[ -z "$env_name" || -z "$build_file_path" || -z "$normal_deps" ]]; then
+  echo "Error: --env-name, --build-file-path, and --normal-deps are required." >&2
+  usage
+fi
+
 if [ -n "$LLAMA_STACK_DIR" ]; then
  echo "Using llama-stack-dir=$LLAMA_STACK_DIR"
 fi
@ -20,50 +101,18 @@ if [ -n "$LLAMA_STACK_CLIENT_DIR" ]; then
  echo "Using llama-stack-client-dir=$LLAMA_STACK_CLIENT_DIR"
 fi

-if [ "$#" -lt 3 ]; then
-  echo "Usage: $0 <distribution_type> <conda_env_name> <build_file_path> <pip_dependencies> [<special_pip_deps>]" >&2
-  echo "Example: $0 <distribution_type> my-conda-env ./my-stack-build.yaml 'numpy pandas scipy'" >&2
-  exit 1
-fi
-
-special_pip_deps="$4"
-
-set -euo pipefail
-
-env_name="$1"
-build_file_path="$2"
-pip_dependencies="$3"
-
-# Define color codes
-RED='\033[0;31m'
-GREEN='\033[0;32m'
-NC='\033[0m' # No Color
-
-# this is set if we actually create a new conda in which case we need to clean up
-ENVNAME=""
-
-SCRIPT_DIR=$(dirname "$(readlink -f "$0")")
-source "$SCRIPT_DIR/common.sh"
-
 ensure_conda_env_python310() {
-  local env_name="$1"
-  local pip_dependencies="$2"
-  local special_pip_deps="$3"
+  # Use only global variables set by flag parser
  local python_version="3.12"

-  # Check if conda command is available
  if ! is_command_available conda; then
    printf "${RED}Error: conda command not found. Is Conda installed and in your PATH?${NC}" >&2
    exit 1
  fi

-  # Check if the environment exists
  if conda env list | grep -q "^${env_name} "; then
    printf "Conda environment '${env_name}' exists. Checking Python version...\n"
-
-    # Check Python version in the environment
    current_version=$(conda run -n "${env_name}" python --version 2>&1 | cut -d' ' -f2 | cut -d'.' -f1,2)
-
    if [ "$current_version" = "$python_version" ]; then
      printf "Environment '${env_name}' already has Python ${python_version}. No action needed.\n"
    else
@ -73,37 +122,37 @@ ensure_conda_env_python310() {
  else
    printf "Conda environment '${env_name}' does not exist. Creating with Python ${python_version}...\n"
    conda create -n "${env_name}" python="${python_version}" -y
-
-    ENVNAME="${env_name}"
-    # setup_cleanup_handlers
  fi

  eval "$(conda shell.bash hook)"
  conda deactivate && conda activate "${env_name}"
-
  "$CONDA_PREFIX"/bin/pip install uv

  if [ -n "$TEST_PYPI_VERSION" ]; then
-    # these packages are damaged in test-pypi, so install them first
    uv pip install fastapi libcst
    uv pip install --extra-index-url https://test.pypi.org/simple/ \
      llama-stack=="$TEST_PYPI_VERSION" \
-      "$pip_dependencies"
-    if [ -n "$special_pip_deps" ]; then
-      IFS='#' read -ra parts <<<"$special_pip_deps"
+      "$normal_deps"
+    if [ -n "$optional_deps" ]; then
+      IFS='#' read -ra parts <<<"$optional_deps"
+      for part in "${parts[@]}"; do
+        echo "$part"
+        uv pip install $part
+      done
+    fi
+    if [ -n "$external_provider_deps" ]; then
+      IFS='#' read -ra parts <<<"$external_provider_deps"
      for part in "${parts[@]}"; do
        echo "$part"
        uv pip install "$part"
      done
    fi
  else
-    # Re-installing llama-stack in the new conda environment
    if [ -n "$LLAMA_STACK_DIR" ]; then
      if [ ! -d "$LLAMA_STACK_DIR" ]; then
        printf "${RED}Warning: LLAMA_STACK_DIR is set but directory does not exist: $LLAMA_STACK_DIR${NC}\n" >&2
        exit 1
      fi
-
      printf "Installing from LLAMA_STACK_DIR: $LLAMA_STACK_DIR\n"
      uv pip install --no-cache-dir -e "$LLAMA_STACK_DIR"
    else
@ -115,31 +164,44 @@ ensure_conda_env_python310() {
      fi
      uv pip install --no-cache-dir "$SPEC_VERSION"
    fi
-
    if [ -n "$LLAMA_STACK_CLIENT_DIR" ]; then
      if [ ! -d "$LLAMA_STACK_CLIENT_DIR" ]; then
        printf "${RED}Warning: LLAMA_STACK_CLIENT_DIR is set but directory does not exist: $LLAMA_STACK_CLIENT_DIR${NC}\n" >&2
        exit 1
      fi
-
      printf "Installing from LLAMA_STACK_CLIENT_DIR: $LLAMA_STACK_CLIENT_DIR\n"
      uv pip install --no-cache-dir -e "$LLAMA_STACK_CLIENT_DIR"
    fi
-
-    # Install pip dependencies
    printf "Installing pip dependencies\n"
-    uv pip install $pip_dependencies
-    if [ -n "$special_pip_deps" ]; then
-      IFS='#' read -ra parts <<<"$special_pip_deps"
+    uv pip install $normal_deps
+    if [ -n "$optional_deps" ]; then
+      IFS='#' read -ra parts <<<"$optional_deps"
      for part in "${parts[@]}"; do
        echo "$part"
        uv pip install $part
      done
    fi
+    if [ -n "$external_provider_deps" ]; then
+      IFS='#' read -ra parts <<<"$external_provider_deps"
+      for part in "${parts[@]}"; do
+        echo "Getting provider spec for module: $part and installing dependencies"
+        package_name=$(echo "$part" | sed 's/[<>=!].*//')
+        python3 -c "
+import importlib
+import sys
+try:
+    module = importlib.import_module(f'$package_name.provider')
+    spec = module.get_provider_spec()
+    if hasattr(spec, 'pip_packages') and spec.pip_packages:
+        print('\\n'.join(spec.pip_packages))
+except Exception as e:
+    print(f'Error getting provider spec for $package_name: {e}', file=sys.stderr)
+" | uv pip install -r -
+      done
+    fi
  fi
-
  mv "$build_file_path" "$CONDA_PREFIX"/llamastack-build.yaml
  echo "Build spec configuration saved at $CONDA_PREFIX/llamastack-build.yaml"
 }

-ensure_conda_env_python310 "$env_name" "$pip_dependencies" "$special_pip_deps"
+ensure_conda_env_python310 "$env_name" "$build_file_path" "$normal_deps" "$optional_deps" "$external_provider_deps"
--- a/llama_stack/distribution/build_container.sh
+++ b/llama_stack/distribution/build_container.sh
@ -18,58 +18,108 @@ UV_HTTP_TIMEOUT=${UV_HTTP_TIMEOUT:-500}

 # mounting is not supported by docker buildx, so we use COPY instead
 USE_COPY_NOT_MOUNT=${USE_COPY_NOT_MOUNT:-}
-
 # Path to the run.yaml file in the container
 RUN_CONFIG_PATH=/app/run.yaml

 BUILD_CONTEXT_DIR=$(pwd)

-if [ "$#" -lt 4 ]; then
-  # This only works for templates
-  echo "Usage: $0 <template_or_config> <image_name> <container_base> <pip_dependencies> [<run_config>] [<special_pip_deps>]" >&2
-  exit 1
-fi
 set -euo pipefail

-template_or_config="$1"
-shift
-image_name="$1"
-shift
-container_base="$1"
-shift
-pip_dependencies="$1"
-shift
-
-# Handle optional arguments
-run_config=""
-special_pip_deps=""
-
-# Check if there are more arguments
-# The logics is becoming cumbersom, we should refactor it if we can do better
-if [ $# -gt 0 ]; then
-  # Check if the argument ends with .yaml
-  if [[ "$1" == *.yaml ]]; then
-    run_config="$1"
-    shift
-    # If there's another argument after .yaml, it must be special_pip_deps
-    if [ $# -gt 0 ]; then
-      special_pip_deps="$1"
-    fi
-  else
-    # If it's not .yaml, it must be special_pip_deps
-    special_pip_deps="$1"
-  fi
-fi
-
 # Define color codes
 RED='\033[0;31m'
 NC='\033[0m' # No Color

+# Usage function
+usage() {
+  echo "Usage: $0 --image-name <image_name> --container-base <container_base> --normal-deps <pip_dependencies> [--run-config <run_config>] [--external-provider-deps <external_provider_deps>] [--optional-deps <special_pip_deps>]"
+  echo "Example: $0 --image-name llama-stack-img --container-base python:3.12-slim --normal-deps 'numpy pandas' --run-config ./run.yaml --external-provider-deps 'foo' --optional-deps 'bar'"
+  exit 1
+}
+
+# Parse arguments
+image_name=""
+container_base=""
+normal_deps=""
+external_provider_deps=""
+optional_deps=""
+run_config=""
+distro_or_config=""
+
+while [[ $# -gt 0 ]]; do
+  key="$1"
+  case "$key" in
+    --image-name)
+      if [[ -z "$2" || "$2" == --* ]]; then
+        echo "Error: --image-name requires a string value" >&2
+        usage
+      fi
+      image_name="$2"
+      shift 2
+      ;;
+    --container-base)
+      if [[ -z "$2" || "$2" == --* ]]; then
+        echo "Error: --container-base requires a string value" >&2
+        usage
+      fi
+      container_base="$2"
+      shift 2
+      ;;
+    --normal-deps)
+      if [[ -z "$2" || "$2" == --* ]]; then
+        echo "Error: --normal-deps requires a string value" >&2
+        usage
+      fi
+      normal_deps="$2"
+      shift 2
+      ;;
+    --external-provider-deps)
+      if [[ -z "$2" || "$2" == --* ]]; then
+        echo "Error: --external-provider-deps requires a string value" >&2
+        usage
+      fi
+      external_provider_deps="$2"
+      shift 2
+      ;;
+    --optional-deps)
+      if [[ -z "$2" || "$2" == --* ]]; then
+        echo "Error: --optional-deps requires a string value" >&2
+        usage
+      fi
+      optional_deps="$2"
+      shift 2
+      ;;
+    --run-config)
+      if [[ -z "$2" || "$2" == --* ]]; then
+        echo "Error: --run-config requires a string value" >&2
+        usage
+      fi
+      run_config="$2"
+      shift 2
+      ;;
+    --distro-or-config)
+      if [[ -z "$2" || "$2" == --* ]]; then
+        echo "Error: --distro-or-config requires a string value" >&2
+        usage
+      fi
+      distro_or_config="$2"
+      shift 2
+      ;;
+    *)
+      echo "Unknown option: $1" >&2
+      usage
+      ;;
+  esac
+done
+
+# Check required arguments
+if [[ -z "$image_name" || -z "$container_base" || -z "$normal_deps" ]]; then
+  echo "Error: --image-name, --container-base, and --normal-deps are required." >&2
+  usage
+fi
+
 CONTAINER_BINARY=${CONTAINER_BINARY:-docker}
 CONTAINER_OPTS=${CONTAINER_OPTS:---progress=plain}
-
 TEMP_DIR=$(mktemp -d)
-
 SCRIPT_DIR=$(dirname "$(readlink -f "$0")")
 source "$SCRIPT_DIR/common.sh"

@ -78,25 +128,22 @@ add_to_container() {
  if [ -t 0 ]; then
    printf '%s\n' "$1" >>"$output_file"
  else
-    # If stdin is not a terminal, read from it (heredoc)
    cat >>"$output_file"
  fi
 }

-# Check if container command is available
 if ! is_command_available "$CONTAINER_BINARY"; then
  printf "${RED}Error: ${CONTAINER_BINARY} command not found. Is ${CONTAINER_BINARY} installed and in your PATH?${NC}" >&2
  exit 1
 fi

-# Update and install UBI9 components if UBI9 base image is used
 if [[ $container_base == *"registry.access.redhat.com/ubi9"* ]]; then
  add_to_container << EOF
 FROM $container_base
 WORKDIR /app

 # We install the Python 3.12 dev headers and build tools so that any
-# C‑extension wheels (e.g. polyleven, faiss‑cpu) can compile successfully.
+# C-extension wheels (e.g. polyleven, faiss-cpu) can compile successfully.

 RUN dnf -y update && dnf install -y iputils git net-tools wget \
    vim-minimal python3.12 python3.12-pip python3.12-wheel \
@ -127,22 +174,52 @@ fi

 # Add pip dependencies first since llama-stack is what will change most often
 # so we can reuse layers.
-if [ -n "$pip_dependencies" ]; then
+if [ -n "$normal_deps" ]; then
+  read -ra pip_args <<<  "$normal_deps"
+  quoted_deps=$(printf " %q" "${pip_args[@]}")
  add_to_container << EOF
-RUN uv pip install --no-cache $pip_dependencies
+RUN uv pip install --no-cache $quoted_deps
 EOF
 fi

-if [ -n "$special_pip_deps" ]; then
-  IFS='#' read -ra parts <<<"$special_pip_deps"
+if [ -n "$optional_deps" ]; then
+  IFS='#' read -ra parts <<<"$optional_deps"
  for part in "${parts[@]}"; do
+    read -ra pip_args <<< "$part"
+    quoted_deps=$(printf " %q" "${pip_args[@]}")
    add_to_container <<EOF
-RUN uv pip install --no-cache $part
+RUN uv pip install --no-cache $quoted_deps
+EOF
+  done
+fi
+
+if [ -n "$external_provider_deps" ]; then
+  IFS='#' read -ra parts <<<"$external_provider_deps"
+  for part in "${parts[@]}"; do
+    read -ra pip_args <<< "$part"
+    quoted_deps=$(printf " %q" "${pip_args[@]}")
+    add_to_container <<EOF
+RUN uv pip install --no-cache $quoted_deps
+EOF
+    add_to_container <<EOF
+RUN python3 - <<PYTHON | uv pip install --no-cache -r -
+import importlib
+import sys
+
+try:
+    package_name = '$part'.split('==')[0].split('>=')[0].split('<=')[0].split('!=')[0].split('<')[0].split('>')[0]
+    module = importlib.import_module(f'{package_name}.provider')
+    spec = module.get_provider_spec()
+    if hasattr(spec, 'pip_packages') and spec.pip_packages:
+        if isinstance(spec.pip_packages, (list, tuple)):
+            print('\n'.join(spec.pip_packages))
+except Exception as e:
+    print(f'Error getting provider spec for {package_name}: {e}', file=sys.stderr)
+PYTHON
 EOF
  done
 fi

-# Function to get Python command
 get_python_cmd() {
    if is_command_available python; then
        echo "python"
@ -169,7 +246,7 @@ if [ -n "$run_config" ]; then
    echo "Copying external providers directory: $external_providers_dir"
    cp -r "$external_providers_dir" "$BUILD_CONTEXT_DIR/providers.d"
    add_to_container << EOF
-COPY --chmod=g+w providers.d /.llama/providers.d
+COPY providers.d /.llama/providers.d
 EOF
    fi

@ -222,7 +299,7 @@ else
  if [ -n "$TEST_PYPI_VERSION" ]; then
    # these packages are damaged in test-pypi, so install them first
    add_to_container << EOF
-RUN uv pip install fastapi libcst
+RUN uv pip install --no-cache fastapi libcst
 EOF
    add_to_container << EOF
 RUN uv pip install --no-cache --extra-index-url https://test.pypi.org/simple/ \
@ -250,12 +327,11 @@ EOF
 # If a run config is provided, we use the --config flag
 if [[ -n "$run_config" ]]; then
  add_to_container << EOF
-ENTRYPOINT ["python", "-m", "llama_stack.distribution.server.server", "--config", "$RUN_CONFIG_PATH"]
+ENTRYPOINT ["python", "-m", "llama_stack.core.server.server", "$RUN_CONFIG_PATH"]
 EOF
-# If a template is provided (not a yaml file), we use the --template flag
-elif [[ "$template_or_config" != *.yaml ]]; then
+elif [[ "$distro_or_config" != *.yaml ]]; then
  add_to_container << EOF
-ENTRYPOINT ["python", "-m", "llama_stack.distribution.server.server", "--template", "$template_or_config"]
+ENTRYPOINT ["python", "-m", "llama_stack.core.server.server", "$distro_or_config"]
 EOF
 fi

@ -328,7 +404,7 @@ $CONTAINER_BINARY build \
  "$BUILD_CONTEXT_DIR"

 # clean up tmp/configs
-rm -f "$BUILD_CONTEXT_DIR/run.yaml"
+rm -rf "$BUILD_CONTEXT_DIR/run.yaml" "$TEMP_DIR"
 set +x

 echo "Success!"
--- a/llama_stack/distribution/build_venv.sh
+++ b/llama_stack/distribution/build_venv.sh
@ -6,9 +6,6 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.

-# TODO: combine this with build_conda_env.sh since it is almost identical
-# the only difference is that we don't do any conda-specific setup
-
 LLAMA_STACK_DIR=${LLAMA_STACK_DIR:-}
 LLAMA_STACK_CLIENT_DIR=${LLAMA_STACK_CLIENT_DIR:-}
 TEST_PYPI_VERSION=${TEST_PYPI_VERSION:-}
@ -18,6 +15,76 @@ UV_HTTP_TIMEOUT=${UV_HTTP_TIMEOUT:-500}
 UV_SYSTEM_PYTHON=${UV_SYSTEM_PYTHON:-}
 VIRTUAL_ENV=${VIRTUAL_ENV:-}

+set -euo pipefail
+
+# Define color codes
+RED='\033[0;31m'
+NC='\033[0m' # No Color
+
+SCRIPT_DIR=$(dirname "$(readlink -f "$0")")
+source "$SCRIPT_DIR/common.sh"
+
+# Usage function
+usage() {
+  echo "Usage: $0 --env-name <env_name> --normal-deps <pip_dependencies> [--external-provider-deps <external_provider_deps>] [--optional-deps <special_pip_deps>]"
+  echo "Example: $0 --env-name mybuild --normal-deps 'numpy pandas scipy' --external-provider-deps 'foo' --optional-deps 'bar'"
+  exit 1
+}
+
+# Parse arguments
+env_name=""
+normal_deps=""
+external_provider_deps=""
+optional_deps=""
+
+while [[ $# -gt 0 ]]; do
+  key="$1"
+  case "$key" in
+    --env-name)
+      if [[ -z "$2" || "$2" == --* ]]; then
+        echo "Error: --env-name requires a string value" >&2
+        usage
+      fi
+      env_name="$2"
+      shift 2
+      ;;
+    --normal-deps)
+      if [[ -z "$2" || "$2" == --* ]]; then
+        echo "Error: --normal-deps requires a string value" >&2
+        usage
+      fi
+      normal_deps="$2"
+      shift 2
+      ;;
+    --external-provider-deps)
+      if [[ -z "$2" || "$2" == --* ]]; then
+        echo "Error: --external-provider-deps requires a string value" >&2
+        usage
+      fi
+      external_provider_deps="$2"
+      shift 2
+      ;;
+    --optional-deps)
+      if [[ -z "$2" || "$2" == --* ]]; then
+        echo "Error: --optional-deps requires a string value" >&2
+        usage
+      fi
+      optional_deps="$2"
+      shift 2
+      ;;
+    *)
+      echo "Unknown option: $1" >&2
+      usage
+      ;;
+  esac
+done
+
+# Check required arguments
+if [[ -z "$env_name" || -z "$normal_deps" ]]; then
+  echo "Error: --env-name and --normal-deps are required." >&2
+  usage
+fi
+
 if [ -n "$LLAMA_STACK_DIR" ]; then
  echo "Using llama-stack-dir=$LLAMA_STACK_DIR"
 fi
@ -25,29 +92,8 @@ if [ -n "$LLAMA_STACK_CLIENT_DIR" ]; then
  echo "Using llama-stack-client-dir=$LLAMA_STACK_CLIENT_DIR"
 fi

-if [ "$#" -lt 2 ]; then
-  echo "Usage: $0 <env_name> <pip_dependencies> [<special_pip_deps>]" >&2
-  echo "Example: $0 mybuild ./my-stack-build.yaml 'numpy pandas scipy'" >&2
-  exit 1
-fi
-
-special_pip_deps="$3"
-
-set -euo pipefail
-
-env_name="$1"
-pip_dependencies="$2"
-
-# Define color codes
-RED='\033[0;31m'
-NC='\033[0m' # No Color
-
-# this is set if we actually create a new conda in which case we need to clean up
 ENVNAME=""

-SCRIPT_DIR=$(dirname "$(readlink -f "$0")")
-source "$SCRIPT_DIR/common.sh"
-
 # pre-run checks to make sure we can proceed with the installation
 pre_run_checks() {
  local env_name="$1"
@ -71,49 +117,44 @@ pre_run_checks() {
 }

 run() {
-  local env_name="$1"
-  local pip_dependencies="$2"
-  local special_pip_deps="$3"
-
+  # Use only global variables set by flag parser
  if [ -n "$UV_SYSTEM_PYTHON" ] || [ "$env_name" == "__system__" ]; then
    echo "Installing dependencies in system Python environment"
-    # if env == __system__, ensure we set UV_SYSTEM_PYTHON
    export UV_SYSTEM_PYTHON=1
  elif [ "$VIRTUAL_ENV" == "$env_name" ]; then
    echo "Virtual environment $env_name is already active"
  else
    echo "Using virtual environment $env_name"
    uv venv "$env_name"
-    # shellcheck source=/dev/null
    source "$env_name/bin/activate"
  fi

  if [ -n "$TEST_PYPI_VERSION" ]; then
-    # these packages are damaged in test-pypi, so install them first
    uv pip install fastapi libcst
-    # shellcheck disable=SC2086
-    # we are building a command line so word splitting is expected
    uv pip install --extra-index-url https://test.pypi.org/simple/ \
      --index-strategy unsafe-best-match \
      llama-stack=="$TEST_PYPI_VERSION" \
-      $pip_dependencies
-    if [ -n "$special_pip_deps" ]; then
-      IFS='#' read -ra parts <<<"$special_pip_deps"
+      $normal_deps
+    if [ -n "$optional_deps" ]; then
+      IFS='#' read -ra parts <<<"$optional_deps"
      for part in "${parts[@]}"; do
        echo "$part"
-        # shellcheck disable=SC2086
-        # we are building a command line so word splitting is expected
        uv pip install $part
      done
    fi
+    if [ -n "$external_provider_deps" ]; then
+      IFS='#' read -ra parts <<<"$external_provider_deps"
+      for part in "${parts[@]}"; do
+        echo "$part"
+        uv pip install "$part"
+      done
+    fi
  else
-    # Re-installing llama-stack in the new virtual environment
    if [ -n "$LLAMA_STACK_DIR" ]; then
      if [ ! -d "$LLAMA_STACK_DIR" ]; then
        printf "${RED}Warning: LLAMA_STACK_DIR is set but directory does not exist: %s${NC}\n" "$LLAMA_STACK_DIR" >&2
        exit 1
      fi
-
      printf "Installing from LLAMA_STACK_DIR: %s\n"  "$LLAMA_STACK_DIR"
      uv pip install --no-cache-dir -e "$LLAMA_STACK_DIR"
    else
@ -125,27 +166,41 @@ run() {
        printf "${RED}Warning: LLAMA_STACK_CLIENT_DIR is set but directory does not exist: %s${NC}\n" "$LLAMA_STACK_CLIENT_DIR" >&2
        exit 1
      fi
-
      printf "Installing from LLAMA_STACK_CLIENT_DIR: %s\n" "$LLAMA_STACK_CLIENT_DIR"
      uv pip install --no-cache-dir -e "$LLAMA_STACK_CLIENT_DIR"
    fi

-    # Install pip dependencies
    printf "Installing pip dependencies\n"
-    # shellcheck disable=SC2086
-    # we are building a command line so word splitting is expected
-    uv pip install $pip_dependencies
-    if [ -n "$special_pip_deps" ]; then
-      IFS='#' read -ra parts <<<"$special_pip_deps"
+    uv pip install $normal_deps
+    if [ -n "$optional_deps" ]; then
+      IFS='#' read -ra parts <<<"$optional_deps"
      for part in "${parts[@]}"; do
-        echo "$part"
-        # shellcheck disable=SC2086
-        # we are building a command line so word splitting is expected
+        echo "Installing special provider module: $part"
        uv pip install $part
      done
    fi
+    if [ -n "$external_provider_deps" ]; then
+      IFS='#' read -ra parts <<<"$external_provider_deps"
+      for part in "${parts[@]}"; do
+        echo "Installing external provider module: $part"
+        uv pip install "$part"
+        echo "Getting provider spec for module: $part and installing dependencies"
+        package_name=$(echo "$part" | sed 's/[<>=!].*//')
+        python3 -c "
+import importlib
+import sys
+try:
+    module = importlib.import_module(f'$package_name.provider')
+    spec = module.get_provider_spec()
+    if hasattr(spec, 'pip_packages') and spec.pip_packages:
+        print('\\n'.join(spec.pip_packages))
+except Exception as e:
+    print(f'Error getting provider spec for $package_name: {e}', file=sys.stderr)
+" | uv pip install -r -
+      done
+    fi
  fi
 }

 pre_run_checks "$env_name"
-run "$env_name" "$pip_dependencies" "$special_pip_deps"
+run
--- a/llama_stack/distribution/client.py
+++ b/llama_stack/distribution/client.py
--- a/llama_stack/distribution/common.sh
+++ b/llama_stack/distribution/common.sh
@ -7,12 +7,10 @@
 # the root directory of this source tree.

 cleanup() {
-  envname="$1"
-
-  set +x
-  echo "Cleaning up..."
-  conda deactivate
-  conda env remove --name "$envname" -y
+  # For venv environments, no special cleanup is needed
+  # This function exists to avoid "function not found" errors
+  local env_name="$1"
+  echo "Cleanup called for environment: $env_name"
 }

 handle_int() {
@ -31,19 +29,7 @@ handle_exit() {
  fi
 }

-setup_cleanup_handlers() {
-  trap handle_int INT
-  trap handle_exit EXIT

-  if is_command_available conda; then
-    __conda_setup="$('conda' 'shell.bash' 'hook' 2>/dev/null)"
-    eval "$__conda_setup"
-    conda deactivate
-  else
-    echo "conda is not available"
-    exit 1
-  fi
-}

 # check if a command is present
 is_command_available() {
--- a/llama_stack/distribution/configure.py
+++ b/llama_stack/distribution/configure.py
@ -7,20 +7,20 @@ import logging
 import textwrap
 from typing import Any

-from llama_stack.distribution.datatypes import (
+from llama_stack.core.datatypes import (
    LLAMA_STACK_RUN_CONFIG_VERSION,
    DistributionSpec,
    Provider,
    StackRunConfig,
 )
-from llama_stack.distribution.distribution import (
+from llama_stack.core.distribution import (
    builtin_automatically_routed_apis,
    get_provider_registry,
 )
-from llama_stack.distribution.stack import replace_env_vars
-from llama_stack.distribution.utils.config_dirs import EXTERNAL_PROVIDERS_DIR
-from llama_stack.distribution.utils.dynamic import instantiate_class_type
-from llama_stack.distribution.utils.prompt_for_config import prompt_for_config
+from llama_stack.core.stack import cast_image_name_to_string, replace_env_vars
+from llama_stack.core.utils.config_dirs import EXTERNAL_PROVIDERS_DIR
+from llama_stack.core.utils.dynamic import instantiate_class_type
+from llama_stack.core.utils.prompt_for_config import prompt_for_config
 from llama_stack.providers.datatypes import Api, ProviderSpec

 logger = logging.getLogger(__name__)
@ -91,21 +91,22 @@ def configure_api_providers(config: StackRunConfig, build_spec: DistributionSpec

            logger.info(f"Configuring API `{api_str}`...")
            updated_providers = []
-            for i, provider_type in enumerate(plist):
+            for i, provider in enumerate(plist):
                if i >= 1:
-                    others = ", ".join(plist[i:])
+                    others = ", ".join(p.provider_type for p in plist[i:])
                    logger.info(
                        f"Not configuring other providers ({others}) interactively. Please edit the resulting YAML directly.\n"
                    )
                    break

-                logger.info(f"> Configuring provider `({provider_type})`")
+                logger.info(f"> Configuring provider `({provider.provider_type})`")
+                pid = provider.provider_type.split("::")[-1]
                updated_providers.append(
                    configure_single_provider(
                        provider_registry[api],
                        Provider(
-                            provider_id=(f"{provider_type}-{i:02d}" if len(plist) > 1 else provider_type),
-                            provider_type=provider_type,
+                            provider_id=(f"{pid}-{i:02d}" if len(plist) > 1 else pid),
+                            provider_type=provider.provider_type,
                            config={},
                        ),
                    )
@ -164,7 +165,8 @@ def upgrade_from_routing_table(
 def parse_and_maybe_upgrade_config(config_dict: dict[str, Any]) -> StackRunConfig:
    version = config_dict.get("version", None)
    if version == LLAMA_STACK_RUN_CONFIG_VERSION:
-        return StackRunConfig(**replace_env_vars(config_dict))
+        processed_config_dict = replace_env_vars(config_dict)
+        return StackRunConfig(**cast_image_name_to_string(processed_config_dict))

    if "routing_table" in config_dict:
        logger.info("Upgrading config...")
@ -175,4 +177,5 @@ def parse_and_maybe_upgrade_config(config_dict: dict[str, Any]) -> StackRunConfi
    if not config_dict.get("external_providers_dir", None):
        config_dict["external_providers_dir"] = EXTERNAL_PROVIDERS_DIR

-    return StackRunConfig(**replace_env_vars(config_dict))
+    processed_config_dict = replace_env_vars(config_dict)
+    return StackRunConfig(**cast_image_name_to_string(processed_config_dict))
--- a/llama_stack/distribution/datatypes.py
+++ b/llama_stack/distribution/datatypes.py
@ -24,7 +24,7 @@ from llama_stack.apis.shields import Shield, ShieldInput
 from llama_stack.apis.tools import Tool, ToolGroup, ToolGroupInput, ToolRuntime
 from llama_stack.apis.vector_dbs import VectorDB, VectorDBInput
 from llama_stack.apis.vector_io import VectorIO
-from llama_stack.distribution.access_control.datatypes import AccessRule
+from llama_stack.core.access_control.datatypes import AccessRule
 from llama_stack.providers.datatypes import Api, ProviderSpec
 from llama_stack.providers.utils.kvstore.config import KVStoreConfig, SqliteKVStoreConfig
 from llama_stack.providers.utils.sqlstore.sqlstore import SqlStoreConfig
@ -36,6 +36,11 @@ LLAMA_STACK_RUN_CONFIG_VERSION = 2
 RoutingKey = str | list[str]


+class RegistryEntrySource(StrEnum):
+    via_register_api = "via_register_api"
+    listed_from_provider = "listed_from_provider"
+
+
 class User(BaseModel):
    principal: str
    # further attributes that may be used for access control decisions
@ -50,6 +55,7 @@ class ResourceWithOwner(Resource):
    resource. This can be used to constrain access to the resource."""

    owner: User | None = None
+    source: RegistryEntrySource = RegistryEntrySource.via_register_api


 # Use the extended Resource for all routable objects
@ -130,29 +136,54 @@ class RoutingTableProviderSpec(ProviderSpec):
    pip_packages: list[str] = Field(default_factory=list)


+class Provider(BaseModel):
+    # provider_id of None means that the provider is not enabled - this happens
+    # when the provider is enabled via a conditional environment variable
+    provider_id: str | None
+    provider_type: str
+    config: dict[str, Any] = {}
+    module: str | None = Field(
+        default=None,
+        description="""
+ Fully-qualified name of the external provider module to import. The module is expected to have:
+
+  - `get_adapter_impl(config, deps)`: returns the adapter implementation
+
+  Example: `module: ramalama_stack`
+ """,
+    )
+
+
+class BuildProvider(BaseModel):
+    provider_type: str
+    module: str | None = Field(
+        default=None,
+        description="""
+ Fully-qualified name of the external provider module to import. The module is expected to have:
+
+  - `get_adapter_impl(config, deps)`: returns the adapter implementation
+
+  Example: `module: ramalama_stack`
+ """,
+    )
+
+
 class DistributionSpec(BaseModel):
    description: str | None = Field(
        default="",
        description="Description of the distribution",
    )
    container_image: str | None = None
-    providers: dict[str, str | list[str]] = Field(
+    providers: dict[str, list[BuildProvider]] = Field(
        default_factory=dict,
        description="""
-Provider Types for each of the APIs provided by this distribution. If you
-select multiple providers, you should provide an appropriate 'routing_map'
-in the runtime configuration to help route to the correct provider.""",
+        Provider Types for each of the APIs provided by this distribution. If you
+        select multiple providers, you should provide an appropriate 'routing_map'
+        in the runtime configuration to help route to the correct provider.
+        """,
    )


-class Provider(BaseModel):
-    # provider_id of None means that the provider is not enabled - this happens
-    # when the provider is enabled via a conditional environment variable
-    provider_id: str | None
-    provider_type: str
-    config: dict[str, Any]
-
-
 class LoggingConfig(BaseModel):
    category_levels: dict[str, str] = Field(
        default_factory=dict,
@ -381,6 +412,11 @@ a default SQLite store will be used.""",
        description="Path to directory containing external provider implementations. The providers code and dependencies must be installed on the system.",
    )

+    external_apis_dir: Path | None = Field(
+        default=None,
+        description="Path to directory containing external API implementations. The APIs code and dependencies must be installed on the system.",
+    )
+
    @field_validator("external_providers_dir")
    @classmethod
    def validate_external_providers_dir(cls, v):
@ -396,8 +432,8 @@ class BuildConfig(BaseModel):

    distribution_spec: DistributionSpec = Field(description="The distribution spec to build including API providers. ")
    image_type: str = Field(
-        default="conda",
-        description="Type of package to build (conda | container | venv)",
+        default="venv",
+        description="Type of package to build (container | venv)",
    )
    image_name: str | None = Field(
        default=None,
@ -412,6 +448,10 @@ class BuildConfig(BaseModel):
        default_factory=list,
        description="Additional pip packages to install in the distribution. These packages will be installed in the distribution environment.",
    )
+    external_apis_dir: Path | None = Field(
+        default=None,
+        description="Path to directory containing external API implementations. The APIs code and dependencies must be installed on the system.",
+    )

    @field_validator("external_providers_dir")
    @classmethod
--- a/llama_stack/core/distribution.py
+++ b/llama_stack/core/distribution.py
@ -0,0 +1,277 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+import glob
+import importlib
+import os
+from typing import Any
+
+import yaml
+from pydantic import BaseModel
+
+from llama_stack.core.datatypes import BuildConfig, DistributionSpec
+from llama_stack.core.external import load_external_apis
+from llama_stack.log import get_logger
+from llama_stack.providers.datatypes import (
+    AdapterSpec,
+    Api,
+    InlineProviderSpec,
+    ProviderSpec,
+    remote_provider_spec,
+)
+
+logger = get_logger(name=__name__, category="core")
+
+
+def stack_apis() -> list[Api]:
+    """Return everything produced by iterating ``Api``, collected into a list."""
+    return [api for api in Api]
+
+
+class AutoRoutedApiInfo(BaseModel):
+    # Pairs a routing-table API with the router API it is listed alongside in
+    # builtin_automatically_routed_apis() (e.g. Api.models with Api.inference).
+    routing_table_api: Api
+    router_api: Api
+
+
+def builtin_automatically_routed_apis() -> list[AutoRoutedApiInfo]:
+    """Return the built-in (routing-table API, router API) pairings.
+
+    A fresh list of ``AutoRoutedApiInfo`` objects is built on every call.
+    """
+    # (routing_table_api, router_api), in the order callers receive them.
+    pairings = [
+        (Api.models, Api.inference),
+        (Api.shields, Api.safety),
+        (Api.vector_dbs, Api.vector_io),
+        (Api.datasets, Api.datasetio),
+        (Api.scoring_functions, Api.scoring),
+        (Api.benchmarks, Api.eval),
+        (Api.tool_groups, Api.tool_runtime),
+    ]
+    return [
+        AutoRoutedApiInfo(routing_table_api=table_api, router_api=router_api)
+        for table_api, router_api in pairings
+    ]
+
+
+def providable_apis() -> list[Api]:
+    """Return the APIs that can be backed by a provider.
+
+    Routing-table APIs as well as ``Api.inspect`` and ``Api.providers`` are left out.
+    """
+    excluded = {info.routing_table_api for info in builtin_automatically_routed_apis()}
+    excluded.update((Api.inspect, Api.providers))
+    return [candidate for candidate in Api if candidate not in excluded]
+
+
+def _load_remote_provider_spec(spec_data: dict[str, Any], api: Api) -> ProviderSpec:
+    """Build a remote provider spec for ``api`` from parsed spec data.
+
+    ``spec_data["adapter"]`` is required; ``api_dependencies`` defaults to an empty list.
+    """
+    adapter_spec = AdapterSpec(**spec_data["adapter"])
+    dependencies = [Api(name) for name in spec_data.get("api_dependencies", [])]
+    return remote_provider_spec(api=api, adapter=adapter_spec, api_dependencies=dependencies)
+
+
+def _load_inline_provider_spec(spec_data: dict[str, Any], api: Api, provider_name: str) -> ProviderSpec:
+    """Build an inline provider spec for ``api`` from parsed spec data.
+
+    ``module`` and ``config_class`` are required keys of ``spec_data``; every other
+    key is optional. The provider type is ``inline::<provider_name>``.
+    """
+
+    def _apis_from(key: str) -> list[Api]:
+        # Optional list of API names, converted to Api members.
+        return [Api(name) for name in spec_data.get(key, [])]
+
+    # Dict literals evaluate in order, so lookups happen in the same sequence
+    # as the keyword arguments they feed.
+    spec_fields = {
+        "api": api,
+        "provider_type": f"inline::{provider_name}",
+        "pip_packages": spec_data.get("pip_packages", []),
+        "module": spec_data["module"],
+        "config_class": spec_data["config_class"],
+        "api_dependencies": _apis_from("api_dependencies"),
+        "optional_api_dependencies": _apis_from("optional_api_dependencies"),
+        "provider_data_validator": spec_data.get("provider_data_validator"),
+        "container_image": spec_data.get("container_image"),
+    }
+    return InlineProviderSpec(**spec_fields)
+
+
+def get_provider_registry(config=None) -> dict[Api, dict[str, ProviderSpec]]:
+    """Get the provider registry, optionally including external providers.
+
+    This function loads both built-in providers and external providers from YAML files or from their provided modules.
+    External providers are loaded from a directory structure like:
+
+    providers.d/
+      remote/
+        inference/
+          custom_ollama.yaml
+          vllm.yaml
+        vector_io/
+          qdrant.yaml
+        safety/
+          llama-guard.yaml
+      inline/
+        inference/
+          custom_ollama.yaml
+          vllm.yaml
+        vector_io/
+          qdrant.yaml
+        safety/
+          llama-guard.yaml
+
+    This function can be called from a variety of places: during build, during run, during stack construction.
+    So when building external providers from a module, there are scenarios where the pip package required to import the module might not be available yet.
+    Build mode is detected from the type of `config` (BuildConfig or DistributionSpec); in that mode a placeholder
+    spec is registered for each `module:` provider instead of importing the module.
+
+    Args:
+        config: Optional config object. Its `external_apis_dir`, `external_providers_dir` and the
+            `module` entries of its providers, when present, select what is loaded in addition
+            to the built-in providers.
+
+    Returns:
+        A dictionary mapping APIs to their available providers
+
+    Raises:
+        FileNotFoundError: If the external providers directory doesn't exist
+        ValueError: If any provider spec is invalid or a provider `module` cannot be imported
+    """
+
+    registry: dict[Api, dict[str, ProviderSpec]] = {}
+    for api in providable_apis():
+        name = api.name.lower()
+        logger.debug(f"Importing module {name}")
+        try:
+            module = importlib.import_module(f"llama_stack.providers.registry.{name}")
+            registry[api] = {a.provider_type: a for a in module.available_providers()}
+        except ImportError as e:
+            # Only logged: the API is left without an entry in the registry.
+            logger.warning(f"Failed to import module {name}: {e}")
+
+    # Refresh providable APIs with external APIs if any
+    external_apis = load_external_apis(config)
+    for api, api_spec in external_apis.items():
+        name = api_spec.name.lower()
+        logger.info(f"Importing external API {name} module {api_spec.module}")
+        try:
+            module = importlib.import_module(api_spec.module)
+            registry[api] = {a.provider_type: a for a in module.available_providers()}
+        except (ImportError, AttributeError) as e:
+            # Populate the registry with an empty dict to avoid breaking the provider registry.
+            # This assumes that the in-tree provider(s) are not available for this API, which means
+            # that users will need to use external providers for this API.
+            registry[api] = {}
+            logger.error(
+                f"Failed to import external API {name}: {e}. Could not populate the in-tree provider(s) registry for {api.name}. \n"
+                "Install the API package to load any in-tree providers for this API."
+            )
+
+    # Check if config has external providers
+    if config:
+        if hasattr(config, "external_providers_dir") and config.external_providers_dir:
+            registry = get_external_providers_from_dir(registry, config)
+        # Regardless of the directory lookup above, also check each provider for a `module:` entry
+        registry = get_external_providers_from_module(
+            registry=registry,
+            config=config,
+            building=(isinstance(config, BuildConfig) or isinstance(config, DistributionSpec)),
+        )
+
+    return registry
+
+
+def get_external_providers_from_dir(
+    registry: dict[Api, dict[str, ProviderSpec]], config
+) -> dict[Api, dict[str, ProviderSpec]]:
+    """Add providers described by YAML files under `config.external_providers_dir`.
+
+    The directory is scanned as `<dir>/<remote|inline>/<api name>/*.yaml`; the file
+    stem becomes the provider name and the spec is registered under
+    `remote::<name>` or `inline::<name>`, overriding any provider already
+    registered under that key.
+
+    Args:
+        registry: Registry to extend; it is modified in place and returned.
+        config: Object exposing `external_providers_dir`.
+
+    Returns:
+        The same `registry` object, including the external providers.
+
+    Raises:
+        FileNotFoundError: If the external providers directory doesn't exist.
+        yaml.YAMLError: If a spec file cannot be parsed.
+        Exception: Any other error raised while building a spec is logged and re-raised.
+    """
+    logger.warning(
+        "Specifying external providers via `external_providers_dir` is being deprecated. Please specify `module:` in the provider instead."
+    )
+    external_providers_dir = os.path.abspath(os.path.expanduser(config.external_providers_dir))
+    if not os.path.exists(external_providers_dir):
+        raise FileNotFoundError(f"External providers directory not found: {external_providers_dir}")
+    logger.info(f"Loading external providers from {external_providers_dir}")
+
+    for api in providable_apis():
+        api_name = api.name.lower()
+
+        # Process both remote and inline providers
+        for provider_type in ["remote", "inline"]:
+            api_dir = os.path.join(external_providers_dir, provider_type, api_name)
+            if not os.path.exists(api_dir):
+                logger.debug(f"No {provider_type} provider directory found for {api_name}")
+                continue
+
+            # Look for provider spec files in the API directory
+            for spec_path in glob.glob(os.path.join(api_dir, "*.yaml")):
+                provider_name = os.path.splitext(os.path.basename(spec_path))[0]
+                logger.info(f"Loading {provider_type} provider spec from {spec_path}")
+
+                try:
+                    with open(spec_path) as f:
+                        spec_data = yaml.safe_load(f)
+
+                    if provider_type == "remote":
+                        spec = _load_remote_provider_spec(spec_data, api)
+                        provider_type_key = f"remote::{provider_name}"
+                    else:
+                        spec = _load_inline_provider_spec(spec_data, api, provider_name)
+                        provider_type_key = f"inline::{provider_name}"
+
+                    logger.info(f"Loaded {provider_type} provider spec for {provider_type_key} from {spec_path}")
+                    # The API may have no entry yet (e.g. its built-in registry module
+                    # could not be imported); create one instead of failing with KeyError.
+                    api_providers = registry.setdefault(api, {})
+                    if provider_type_key in api_providers:
+                        logger.warning(f"Overriding already registered provider {provider_type_key} for {api.name}")
+                    api_providers[provider_type_key] = spec
+                    logger.info(f"Successfully loaded external provider {provider_type_key}")
+                except yaml.YAMLError as yaml_err:
+                    logger.error(f"Failed to parse YAML file {spec_path}: {yaml_err}")
+                    raise
+                except Exception as e:
+                    logger.error(f"Failed to load provider spec from {spec_path}: {e}")
+                    raise
+
+    return registry
+
+
+def get_external_providers_from_module(
+    registry: dict[Api, dict[str, ProviderSpec]], config, building: bool
+) -> dict[Api, dict[str, ProviderSpec]]:
+    """Register external providers that are declared through a `module:` entry.
+
+    Args:
+        registry: Registry to extend; it is modified in place and returned.
+        config: A BuildConfig (providers are read from its `distribution_spec`) or any
+            object exposing a `providers` mapping of API name to provider list.
+        building: When True the provider module is not imported; a partially filled
+            ProviderSpec is registered instead. When False, `<package>.provider` is
+            imported and its `get_provider_spec()` result is registered.
+
+    Returns:
+        The same `registry` object, including the module-based providers.
+
+    Raises:
+        ValueError: If the provider module (or one of its imports) cannot be found.
+        Exception: Any other error is logged and re-raised.
+    """
+    # A BuildConfig nests its providers under `distribution_spec`; other configs expose them directly.
+    provider_source = config.distribution_spec if isinstance(config, BuildConfig) else config
+    providers_by_api = getattr(provider_source, "providers", None)
+    if providers_by_api is None:
+        logger.warning("Could not get list of providers from config")
+        return registry
+    for provider_api, providers in providers_by_api.items():
+        for provider in providers:
+            # Entries without a module are not module-based external providers.
+            if getattr(provider, "module", None) is None:
+                continue
+            # get provider using module
+            try:
+                if not building:
+                    # `module` may carry a `==<version>` suffix; only the package name is importable.
+                    package_name = provider.module.split("==")[0]
+                    module = importlib.import_module(f"{package_name}.provider")
+                    # if config class is wrong you will get an error saying module could not be imported
+                    spec = module.get_provider_spec()
+                else:
+                    # When building we CANNOT import the module because it has not been installed yet.
+                    # Register a partially filled out spec that is overwritten later upon build and run.
+                    spec = ProviderSpec(
+                        api=Api(provider_api),
+                        provider_type=provider.provider_type,
+                        is_external=True,
+                        module=provider.module,
+                        config_class="",
+                    )
+                # The API may have no registry entry yet; create one instead of failing with KeyError.
+                registry.setdefault(Api(provider_api), {})[provider.provider_type] = spec
+            except ModuleNotFoundError as exc:
+                raise ValueError(
+                    "get_provider_spec not found. If specifying an external provider via `module` in the Provider spec, the Provider must have the `provider.get_provider_spec` module available"
+                ) from exc
+            except Exception as e:
+                logger.error(f"Failed to load provider spec from module {provider.module}: {e}")
+                raise
+    return registry
--- a/llama_stack/core/external.py
+++ b/llama_stack/core/external.py
@ -0,0 +1,54 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+
+import yaml
+
+from llama_stack.apis.datatypes import Api, ExternalApiSpec
+from llama_stack.core.datatypes import BuildConfig, StackRunConfig
+from llama_stack.log import get_logger
+
+logger = get_logger(name=__name__, category="core")
+
+
+def load_external_apis(config: StackRunConfig | BuildConfig | None) -> dict[Api, ExternalApiSpec]:
+    """Load external API specifications from the configured directory.
+
+    Every `*.yaml` file directly inside `config.external_apis_dir` is parsed into an
+    ExternalApiSpec and its name is registered through `Api.add`.
+
+    Args:
+        config: StackRunConfig or BuildConfig containing the external APIs directory path.
+            `None`, or a config object without an `external_apis_dir` value, yields no APIs.
+
+    Returns:
+        A dictionary mapping each registered Api to its specification. Empty when no
+        directory is configured or the configured path is not a directory.
+
+    Raises:
+        yaml.YAMLError: If a spec file cannot be parsed.
+        Exception: Any other error raised while loading a spec is logged and re-raised.
+    """
+    # getattr: callers such as the provider registry may pass config objects
+    # (e.g. a DistributionSpec) that have no `external_apis_dir` field at all.
+    configured_dir = getattr(config, "external_apis_dir", None) if config else None
+    if not configured_dir:
+        return {}
+
+    external_apis_dir = configured_dir.expanduser().resolve()
+    if not external_apis_dir.is_dir():
+        logger.error(f"External APIs directory is not a directory: {external_apis_dir}")
+        return {}
+
+    logger.info(f"Loading external APIs from {external_apis_dir}")
+    external_apis: dict[Api, ExternalApiSpec] = {}
+
+    # Look for YAML files in the external APIs directory
+    for yaml_path in external_apis_dir.glob("*.yaml"):
+        try:
+            with open(yaml_path) as f:
+                spec_data = yaml.safe_load(f)
+
+            spec = ExternalApiSpec(**spec_data)
+            api = Api.add(spec.name)
+            logger.info(f"Loaded external API spec for {spec.name} from {yaml_path}")
+            external_apis[api] = spec
+        except yaml.YAMLError as yaml_err:
+            logger.error(f"Failed to parse YAML file {yaml_path}: {yaml_err}")
+            raise
+        except Exception:
+            logger.exception(f"Failed to load external API spec from {yaml_path}")
+            raise
+
+    return external_apis
--- a/llama_stack/distribution/inspect.py
+++ b/llama_stack/distribution/inspect.py
@ -15,8 +15,9 @@ from llama_stack.apis.inspect import (
    RouteInfo,
    VersionInfo,
 )
-from llama_stack.distribution.datatypes import StackRunConfig
-from llama_stack.distribution.server.routes import get_all_api_routes
+from llama_stack.core.datatypes import StackRunConfig
+from llama_stack.core.external import load_external_apis
+from llama_stack.core.server.routes import get_all_api_routes
 from llama_stack.providers.datatypes import HealthStatus


@ -42,7 +43,8 @@ class DistributionInspectImpl(Inspect):
        run_config: StackRunConfig = self.config.run_config

        ret = []
-        all_endpoints = get_all_api_routes()
+        external_apis = load_external_apis(run_config)
+        all_endpoints = get_all_api_routes(external_apis)
        for api, endpoints in all_endpoints.items():
            # Always include provider and inspect APIs, filter others based on run config
            if api.value in ["providers", "inspect"]:
@ -53,7 +55,8 @@ class DistributionInspectImpl(Inspect):
                            method=next(iter([m for m in e.methods if m != "HEAD"])),
                            provider_types=[],  # These APIs don't have "real" providers - they're internal to the stack
                        )
-                        for e in endpoints
+                        for e, _ in endpoints
+                        if e.methods is not None
                    ]
                )
            else:
@ -66,7 +69,8 @@ class DistributionInspectImpl(Inspect):
                                method=next(iter([m for m in e.methods if m != "HEAD"])),
                                provider_types=[p.provider_type for p in providers],
                            )
-                            for e in endpoints
+                            for e, _ in endpoints
+                            if e.methods is not None
                        ]
                    )

--- a/llama_stack/distribution/library_client.py
+++ b/llama_stack/distribution/library_client.py
@ -12,11 +12,13 @@ import os
 import sys
 from concurrent.futures import ThreadPoolExecutor
 from enum import Enum
+from io import BytesIO
 from pathlib import Path
 from typing import Any, TypeVar, Union, get_args, get_origin

 import httpx
 import yaml
+from fastapi import Response as FastAPIResponse
 from llama_stack_client import (
    NOT_GIVEN,
    APIResponse,
@ -29,23 +31,23 @@ from pydantic import BaseModel, TypeAdapter
 from rich.console import Console
 from termcolor import cprint

-from llama_stack.distribution.build import print_pip_install_help
-from llama_stack.distribution.configure import parse_and_maybe_upgrade_config
-from llama_stack.distribution.datatypes import Api, BuildConfig, DistributionSpec
-from llama_stack.distribution.request_headers import (
+from llama_stack.core.build import print_pip_install_help
+from llama_stack.core.configure import parse_and_maybe_upgrade_config
+from llama_stack.core.datatypes import Api, BuildConfig, BuildProvider, DistributionSpec
+from llama_stack.core.request_headers import (
    PROVIDER_DATA_VAR,
    request_provider_data_context,
 )
-from llama_stack.distribution.resolver import ProviderRegistry
-from llama_stack.distribution.server.routes import find_matching_route, initialize_route_impls
-from llama_stack.distribution.stack import (
+from llama_stack.core.resolver import ProviderRegistry
+from llama_stack.core.server.routes import RouteImpls, find_matching_route, initialize_route_impls
+from llama_stack.core.stack import (
    construct_stack,
-    get_stack_run_config_from_template,
+    get_stack_run_config_from_distro,
    replace_env_vars,
 )
-from llama_stack.distribution.utils.config import redact_sensitive_fields
-from llama_stack.distribution.utils.context import preserve_contexts_async_generator
-from llama_stack.distribution.utils.exec import in_notebook
+from llama_stack.core.utils.config import redact_sensitive_fields
+from llama_stack.core.utils.context import preserve_contexts_async_generator
+from llama_stack.core.utils.exec import in_notebook
 from llama_stack.providers.utils.telemetry.tracing import (
    CURRENT_TRACE_CONTEXT,
    end_trace,
@ -112,22 +114,45 @@ def convert_to_pydantic(annotation: Any, value: Any) -> Any:
        raise ValueError(f"Failed to convert parameter {value} into {annotation}: {e}") from e


+class LibraryClientUploadFile:
+    """In-memory upload object exposing the parts of FastAPI's UploadFile the library client uses.
+
+    Stores the file name and raw bytes; the content type is always
+    ``application/octet-stream``.
+    """
+
+    def __init__(self, filename: str, content: bytes):
+        self.filename, self.content = filename, content
+        self.content_type = "application/octet-stream"
+
+    async def read(self) -> bytes:
+        """Return the stored bytes in full."""
+        payload = self.content
+        return payload
+
+
+class LibraryClientHttpxResponse:
+    """LibraryClient httpx Response object for FastAPI Response conversion.
+
+    Copies `body`, `status_code` and `headers` from the given response. The body is
+    always exposed as `bytes` in `content`: bytes-like bodies (bytes, bytearray,
+    memoryview) are converted with `bytes()`, anything else is `.encode()`d.
+    """
+
+    def __init__(self, response):
+        body = response.body
+        # A response body may be a memoryview, which has no .encode(); normalise every
+        # bytes-like body instead of only accepting plain bytes.
+        if isinstance(body, (bytes, bytearray, memoryview)):
+            self.content = bytes(body)
+        else:
+            self.content = body.encode()
+        self.status_code = response.status_code
+        self.headers = response.headers
+
+
 class LlamaStackAsLibraryClient(LlamaStackClient):
    def __init__(
        self,
-        config_path_or_template_name: str,
+        config_path_or_distro_name: str,
        skip_logger_removal: bool = False,
        custom_provider_registry: ProviderRegistry | None = None,
        provider_data: dict[str, Any] | None = None,
    ):
        super().__init__()
        self.async_client = AsyncLlamaStackAsLibraryClient(
-            config_path_or_template_name, custom_provider_registry, provider_data
+            config_path_or_distro_name, custom_provider_registry, provider_data
        )
        self.pool_executor = ThreadPoolExecutor(max_workers=4)
        self.skip_logger_removal = skip_logger_removal
        self.provider_data = provider_data

+        self.loop = asyncio.new_event_loop()
+
    def initialize(self):
        if in_notebook():
            import nest_asyncio
@ -136,7 +161,13 @@ class LlamaStackAsLibraryClient(LlamaStackClient):
            if not self.skip_logger_removal:
                self._remove_root_logger_handlers()

-        return asyncio.run(self.async_client.initialize())
+        # use a new event loop to avoid interfering with the main event loop
+        loop = asyncio.new_event_loop()
+        asyncio.set_event_loop(loop)
+        try:
+            return loop.run_until_complete(self.async_client.initialize())
+        finally:
+            asyncio.set_event_loop(None)

    def _remove_root_logger_handlers(self):
        """
@ -149,10 +180,7 @@ class LlamaStackAsLibraryClient(LlamaStackClient):
            logger.info(f"Removed handler {handler.__class__.__name__} from root logger")

    def request(self, *args, **kwargs):
-        # NOTE: We are using AsyncLlamaStackClient under the hood
-        # A new event loop is needed to convert the AsyncStream
-        # from async client into SyncStream return type for streaming
-        loop = asyncio.new_event_loop()
+        loop = self.loop
        asyncio.set_event_loop(loop)

        if kwargs.get("stream"):
@ -169,7 +197,6 @@ class LlamaStackAsLibraryClient(LlamaStackClient):
                    pending = asyncio.all_tasks(loop)
                    if pending:
                        loop.run_until_complete(asyncio.gather(*pending, return_exceptions=True))
-                    loop.close()

            return sync_generator()
        else:
@ -179,14 +206,13 @@ class LlamaStackAsLibraryClient(LlamaStackClient):
                pending = asyncio.all_tasks(loop)
                if pending:
                    loop.run_until_complete(asyncio.gather(*pending, return_exceptions=True))
-                loop.close()
            return result


 class AsyncLlamaStackAsLibraryClient(AsyncLlamaStackClient):
    def __init__(
        self,
-        config_path_or_template_name: str,
+        config_path_or_distro_name: str,
        custom_provider_registry: ProviderRegistry | None = None,
        provider_data: dict[str, Any] | None = None,
    ):
@ -196,20 +222,21 @@ class AsyncLlamaStackAsLibraryClient(AsyncLlamaStackClient):
        current_sinks = os.environ.get("TELEMETRY_SINKS", "sqlite").split(",")
        os.environ["TELEMETRY_SINKS"] = ",".join(sink for sink in current_sinks if sink != "console")

-        if config_path_or_template_name.endswith(".yaml"):
-            config_path = Path(config_path_or_template_name)
+        if config_path_or_distro_name.endswith(".yaml"):
+            config_path = Path(config_path_or_distro_name)
            if not config_path.exists():
                raise ValueError(f"Config file {config_path} does not exist")
            config_dict = replace_env_vars(yaml.safe_load(config_path.read_text()))
            config = parse_and_maybe_upgrade_config(config_dict)
        else:
-            # template
-            config = get_stack_run_config_from_template(config_path_or_template_name)
+            # distribution
+            config = get_stack_run_config_from_distro(config_path_or_distro_name)

-        self.config_path_or_template_name = config_path_or_template_name
+        self.config_path_or_distro_name = config_path_or_distro_name
        self.config = config
        self.custom_provider_registry = custom_provider_registry
        self.provider_data = provider_data
+        self.route_impls: RouteImpls | None = None  # Initialize to None to prevent AttributeError

    async def initialize(self) -> bool:
        try:
@ -218,20 +245,21 @@ class AsyncLlamaStackAsLibraryClient(AsyncLlamaStackClient):
        except ModuleNotFoundError as _e:
            cprint(_e.msg, color="red", file=sys.stderr)
            cprint(
-                "Using llama-stack as a library requires installing dependencies depending on the template (providers) you choose.\n",
+                "Using llama-stack as a library requires installing dependencies depending on the distribution (providers) you choose.\n",
                color="yellow",
                file=sys.stderr,
            )
-            if self.config_path_or_template_name.endswith(".yaml"):
-                # Convert Provider objects to their types
-                provider_types: dict[str, str | list[str]] = {}
-                for api, providers in self.config.providers.items():
-                    types = [p.provider_type for p in providers]
-                    # Convert single-item lists to strings
-                    provider_types[api] = types[0] if len(types) == 1 else types
+            if self.config_path_or_distro_name.endswith(".yaml"):
+                providers: dict[str, list[BuildProvider]] = {}
+                for api, run_providers in self.config.providers.items():
+                    for provider in run_providers:
+                        providers.setdefault(api, []).append(
+                            BuildProvider(provider_type=provider.provider_type, module=provider.module)
+                        )
+                providers = dict(providers)
                build_config = BuildConfig(
                    distribution_spec=DistributionSpec(
-                        providers=provider_types,
+                        providers=providers,
                    ),
                    external_providers_dir=self.config.external_providers_dir,
                )
@ -239,7 +267,7 @@ class AsyncLlamaStackAsLibraryClient(AsyncLlamaStackClient):
            else:
                prefix = "!" if in_notebook() else ""
                cprint(
-                    f"Please run:\n\n{prefix}llama stack build --template {self.config_path_or_template_name} --image-type venv\n\n",
+                    f"Please run:\n\n{prefix}llama stack build --distro {self.config_path_or_distro_name} --image-type venv\n\n",
                    "yellow",
                    file=sys.stderr,
                )
@ -255,7 +283,7 @@ class AsyncLlamaStackAsLibraryClient(AsyncLlamaStackClient):

        if not os.environ.get("PYTEST_CURRENT_TEST"):
            console = Console()
-            console.print(f"Using config [blue]{self.config_path_or_template_name}[/blue]:")
+            console.print(f"Using config [blue]{self.config_path_or_distro_name}[/blue]:")
            safe_config = redact_sensitive_fields(self.config.model_dump())
            console.print(yaml.dump(safe_config, indent=2))

@ -270,8 +298,8 @@ class AsyncLlamaStackAsLibraryClient(AsyncLlamaStackClient):
        stream=False,
        stream_cls=None,
    ):
-        if not self.route_impls:
-            raise ValueError("Client not initialized")
+        if self.route_impls is None:
+            raise ValueError("Client not initialized. Please call initialize() first.")

        # Create headers with provider data if available
        headers = options.headers or {}
@ -295,30 +323,63 @@ class AsyncLlamaStackAsLibraryClient(AsyncLlamaStackClient):
                )
            return response

+    def _handle_file_uploads(self, options: Any, body: dict) -> tuple[dict, list[str]]:
+        """Move BytesIO uploads from ``options.files`` into the request body.
+
+        ``options.files`` is only processed when it is a non-empty list. Each entry
+        that is a tuple of at least ``(field_name, file_object)`` with a ``BytesIO``
+        file object is read from the start and stored in ``body[field_name]`` as a
+        ``LibraryClientUploadFile``; all other entries are ignored.
+
+        Returns the (mutated) body and the list of field names that were filled in.
+        """
+        files = getattr(options, "files", None)
+        if not files or not isinstance(files, list):
+            return body, []
+
+        uploaded_fields: list[str] = []
+        for entry in files:
+            looks_like_upload = isinstance(entry, tuple) and len(entry) >= 2
+            if not looks_like_upload:
+                continue
+
+            field_name, file_object = entry[0], entry[1]
+            if not isinstance(file_object, BytesIO):
+                continue
+
+            # Rewind first so the whole buffer is captured regardless of the current position.
+            file_object.seek(0)
+            payload = file_object.read()
+            upload_name = getattr(file_object, "name", "uploaded_file")
+            uploaded_fields.append(field_name)
+            body[field_name] = LibraryClientUploadFile(upload_name, payload)
+
+        return body, uploaded_fields
+
    async def _call_non_streaming(
        self,
        *,
        cast_to: Any,
        options: Any,
    ):
-        if self.route_impls is None:
-            raise ValueError("Client not initialized")
-
+        assert self.route_impls is not None  # Should be guaranteed by request() method, assertion for mypy
        path = options.url
        body = options.params or {}
        body |= options.json_data or {}

-        matched_func, path_params, route = find_matching_route(options.method, path, self.route_impls)
+        matched_func, path_params, route_path, webmethod = find_matching_route(options.method, path, self.route_impls)
        body |= path_params
-        body = self._convert_body(path, options.method, body)
-        await start_trace(route, {"__location__": "library_client"})
+
+        body, field_names = self._handle_file_uploads(options, body)
+
+        body = self._convert_body(path, options.method, body, exclude_params=set(field_names))
+
+        trace_path = webmethod.descriptive_name or route_path
+        await start_trace(trace_path, {"__location__": "library_client"})
        try:
            result = await matched_func(**body)
        finally:
            await end_trace()

+        # Handle FastAPI Response objects (e.g., from file content retrieval)
+        if isinstance(result, FastAPIResponse):
+            return LibraryClientHttpxResponse(result)
+
        json_content = json.dumps(convert_pydantic_to_json_value(result))

+        filtered_body = {k: v for k, v in body.items() if not isinstance(v, LibraryClientUploadFile)}
        mock_response = httpx.Response(
            status_code=httpx.codes.OK,
            content=json_content.encode("utf-8"),
@ -330,7 +391,7 @@ class AsyncLlamaStackAsLibraryClient(AsyncLlamaStackClient):
                url=options.url,
                params=options.params,
                headers=options.headers or {},
-                json=convert_pydantic_to_json_value(body),
+                json=convert_pydantic_to_json_value(filtered_body),
            ),
        )
        response = APIResponse(
@ -350,18 +411,17 @@ class AsyncLlamaStackAsLibraryClient(AsyncLlamaStackClient):
        options: Any,
        stream_cls: Any,
    ):
-        if self.route_impls is None:
-            raise ValueError("Client not initialized")
-
+        assert self.route_impls is not None  # Should be guaranteed by request() method, assertion for mypy
        path = options.url
        body = options.params or {}
        body |= options.json_data or {}
-        func, path_params, route = find_matching_route(options.method, path, self.route_impls)
+        func, path_params, route_path, webmethod = find_matching_route(options.method, path, self.route_impls)
        body |= path_params

        body = self._convert_body(path, options.method, body)

-        await start_trace(route, {"__location__": "library_client"})
+        trace_path = webmethod.descriptive_name or route_path
+        await start_trace(trace_path, {"__location__": "library_client"})

        async def gen():
            try:
@ -392,8 +452,9 @@ class AsyncLlamaStackAsLibraryClient(AsyncLlamaStackClient):
        # we use asynchronous impl always internally and channel all requests to AsyncLlamaStackClient
        # however, the top-level caller may be a SyncAPIClient -- so its stream_cls might be a Stream (SyncStream)
        # so we need to convert it to AsyncStream
+        # mypy can't track runtime variables inside the [...] of a generic, so ignore that check
        args = get_args(stream_cls)
-        stream_cls = AsyncStream[args[0]]
+        stream_cls = AsyncStream[args[0]]  # type: ignore[valid-type]
        response = AsyncAPIResponse(
            raw=mock_response,
            client=self,
@ -404,14 +465,16 @@ class AsyncLlamaStackAsLibraryClient(AsyncLlamaStackClient):
        )
        return await response.parse()

-    def _convert_body(self, path: str, method: str, body: dict | None = None) -> dict:
+    def _convert_body(
+        self, path: str, method: str, body: dict | None = None, exclude_params: set[str] | None = None
+    ) -> dict:
        if not body:
            return {}

-        if self.route_impls is None:
-            raise ValueError("Client not initialized")
+        assert self.route_impls is not None  # Should be guaranteed by request() method, assertion for mypy
+        exclude_params = exclude_params or set()

-        func, _, _ = find_matching_route(method, path, self.route_impls)
+        func, _, _, _ = find_matching_route(method, path, self.route_impls)
        sig = inspect.signature(func)

        # Strip NOT_GIVENs to use the defaults in signature
@ -422,6 +485,9 @@ class AsyncLlamaStackAsLibraryClient(AsyncLlamaStackClient):
        for param_name, param in sig.parameters.items():
            if param_name in body:
                value = body.get(param_name)
-                converted_body[param_name] = convert_to_pydantic(param.annotation, value)
+                if param_name in exclude_params:
+                    converted_body[param_name] = value
+                else:
+                    converted_body[param_name] = convert_to_pydantic(param.annotation, value)

        return converted_body
--- a/llama_stack/distribution/providers.py
+++ b/llama_stack/distribution/providers.py
--- a/llama_stack/distribution/request_headers.py
+++ b/llama_stack/distribution/request_headers.py
@ -10,7 +10,7 @@ import logging
 from contextlib import AbstractContextManager
 from typing import Any

-from llama_stack.distribution.datatypes import User
+from llama_stack.core.datatypes import User

 from .utils.dynamic import instantiate_class_type

@ -101,3 +101,15 @@ def get_authenticated_user() -> User | None:
    if not provider_data:
        return None
    return provider_data.get("__authenticated_user")
+
+
+def user_from_scope(scope: dict) -> User | None:
+    """Create a User object from ASGI scope data (set by authentication middleware)"""
+    user_attributes = scope.get("user_attributes", {})
+    principal = scope.get("principal", "")
+
+    # auth not enabled
+    if not principal and not user_attributes:
+        return None
+
+    return User(principal=principal, attributes=user_attributes)
--- a/llama_stack/distribution/resolver.py
+++ b/llama_stack/distribution/resolver.py
@ -11,6 +11,7 @@ from llama_stack.apis.agents import Agents
 from llama_stack.apis.benchmarks import Benchmarks
 from llama_stack.apis.datasetio import DatasetIO
 from llama_stack.apis.datasets import Datasets
+from llama_stack.apis.datatypes import ExternalApiSpec
 from llama_stack.apis.eval import Eval
 from llama_stack.apis.files import Files
 from llama_stack.apis.inference import Inference, InferenceProvider
@ -26,17 +27,18 @@ from llama_stack.apis.telemetry import Telemetry
 from llama_stack.apis.tools import ToolGroups, ToolRuntime
 from llama_stack.apis.vector_dbs import VectorDBs
 from llama_stack.apis.vector_io import VectorIO
-from llama_stack.distribution.client import get_client_impl
-from llama_stack.distribution.datatypes import (
+from llama_stack.core.client import get_client_impl
+from llama_stack.core.datatypes import (
    AccessRule,
    AutoRoutedProviderSpec,
    Provider,
    RoutingTableProviderSpec,
    StackRunConfig,
 )
-from llama_stack.distribution.distribution import builtin_automatically_routed_apis
-from llama_stack.distribution.store import DistributionRegistry
-from llama_stack.distribution.utils.dynamic import instantiate_class_type
+from llama_stack.core.distribution import builtin_automatically_routed_apis
+from llama_stack.core.external import load_external_apis
+from llama_stack.core.store import DistributionRegistry
+from llama_stack.core.utils.dynamic import instantiate_class_type
 from llama_stack.log import get_logger
 from llama_stack.providers.datatypes import (
    Api,
@ -59,8 +61,16 @@ class InvalidProviderError(Exception):
    pass


-def api_protocol_map() -> dict[Api, Any]:
-    return {
+def api_protocol_map(external_apis: dict[Api, ExternalApiSpec] | None = None) -> dict[Api, Any]:
+    """Get a mapping of API types to their protocol classes.
+
+    Args:
+        external_apis: Optional dictionary of external API specifications
+
+    Returns:
+        Dictionary mapping API types to their protocol classes
+    """
+    protocols = {
        Api.providers: ProvidersAPI,
        Api.agents: Agents,
        Api.inference: Inference,
@ -83,10 +93,23 @@ def api_protocol_map() -> dict[Api, Any]:
        Api.files: Files,
    }

+    if external_apis:
+        for api, api_spec in external_apis.items():
+            try:
+                module = importlib.import_module(api_spec.module)
+                api_class = getattr(module, api_spec.protocol)

-def api_protocol_map_for_compliance_check() -> dict[Api, Any]:
+                protocols[api] = api_class
+            except (ImportError, AttributeError):
+                logger.exception(f"Failed to load external API {api_spec.name}")
+
+    return protocols
+
+
+def api_protocol_map_for_compliance_check(config: Any) -> dict[Api, Any]:
+    external_apis = load_external_apis(config)
    return {
-        **api_protocol_map(),
+        **api_protocol_map(external_apis),
        Api.inference: InferenceProvider,
    }

@ -160,7 +183,7 @@ def specs_for_autorouted_apis(apis_to_serve: list[str] | set[str]) -> dict[str,
                spec=RoutingTableProviderSpec(
                    api=info.routing_table_api,
                    router_api=info.router_api,
-                    module="llama_stack.distribution.routers",
+                    module="llama_stack.core.routers",
                    api_dependencies=[],
                    deps__=[f"inner-{info.router_api.value}"],
                ),
@ -174,7 +197,7 @@ def specs_for_autorouted_apis(apis_to_serve: list[str] | set[str]) -> dict[str,
                config={},
                spec=AutoRoutedProviderSpec(
                    api=info.router_api,
-                    module="llama_stack.distribution.routers",
+                    module="llama_stack.core.routers",
                    routing_table_api=info.routing_table_api,
                    api_dependencies=[info.routing_table_api],
                    # Add telemetry as an optional dependency to all auto-routed providers
@ -200,7 +223,7 @@ def validate_and_prepare_providers(
        specs = {}
        for provider in providers:
            if not provider.provider_id or provider.provider_id == "__disabled__":
-                logger.warning(f"Provider `{provider.provider_type}` for API `{api}` is disabled")
+                logger.debug(f"Provider `{provider.provider_type}` for API `{api}` is disabled")
                continue

            validate_provider(provider, api, provider_registry)
@ -250,7 +273,7 @@ async def instantiate_providers(
    dist_registry: DistributionRegistry,
    run_config: StackRunConfig,
    policy: list[AccessRule],
-) -> dict:
+) -> dict[Api, Any]:
    """Instantiates providers asynchronously while managing dependencies."""
    impls: dict[Api, Any] = {}
    inner_impls_by_provider_id: dict[str, dict[str, Any]] = {f"inner-{x.value}": {} for x in router_apis}
@ -322,7 +345,7 @@ async def instantiate_provider(
    policy: list[AccessRule],
 ):
    provider_spec = provider.spec
-    if not hasattr(provider_spec, "module"):
+    if not hasattr(provider_spec, "module") or provider_spec.module is None:
        raise AttributeError(f"ProviderSpec of type {type(provider_spec)} does not have a 'module' attribute")

    logger.debug(f"Instantiating provider {provider.provider_id} from {provider_spec.module}")
@ -360,7 +383,7 @@ async def instantiate_provider(
    impl.__provider_spec__ = provider_spec
    impl.__provider_config__ = config

-    protocols = api_protocol_map_for_compliance_check()
+    protocols = api_protocol_map_for_compliance_check(run_config)
    additional_protocols = additional_protocols_map()
    # TODO: check compliance for special tool groups
    # the impl should be for Api.tool_runtime, the name should be the special tool group, the protocol should be the special tool group protocol
--- a/llama_stack/distribution/routers/__init__.py
+++ b/llama_stack/distribution/routers/__init__.py
@ -6,9 +6,9 @@

 from typing import Any

-from llama_stack.distribution.datatypes import AccessRule, RoutedProtocol
-from llama_stack.distribution.stack import StackRunConfig
-from llama_stack.distribution.store import DistributionRegistry
+from llama_stack.core.datatypes import AccessRule, RoutedProtocol
+from llama_stack.core.stack import StackRunConfig
+from llama_stack.core.store import DistributionRegistry
 from llama_stack.providers.datatypes import Api, RoutingTable
 from llama_stack.providers.utils.inference.inference_store import InferenceStore

--- a/llama_stack/distribution/routers/datasets.py
+++ b/llama_stack/distribution/routers/datasets.py
@ -57,7 +57,8 @@ class DatasetIORouter(DatasetIO):
        logger.debug(
            f"DatasetIORouter.iterrows: {dataset_id}, {start_index=} {limit=}",
        )
-        return await self.routing_table.get_provider_impl(dataset_id).iterrows(
+        provider = await self.routing_table.get_provider_impl(dataset_id)
+        return await provider.iterrows(
            dataset_id=dataset_id,
            start_index=start_index,
            limit=limit,
@ -65,7 +66,8 @@ class DatasetIORouter(DatasetIO):

    async def append_rows(self, dataset_id: str, rows: list[dict[str, Any]]) -> None:
        logger.debug(f"DatasetIORouter.append_rows: {dataset_id}, {len(rows)} rows")
-        return await self.routing_table.get_provider_impl(dataset_id).append_rows(
+        provider = await self.routing_table.get_provider_impl(dataset_id)
+        return await provider.append_rows(
            dataset_id=dataset_id,
            rows=rows,
        )
--- a/llama_stack/distribution/routers/eval_scoring.py
+++ b/llama_stack/distribution/routers/eval_scoring.py
@ -44,7 +44,8 @@ class ScoringRouter(Scoring):
        logger.debug(f"ScoringRouter.score_batch: {dataset_id}")
        res = {}
        for fn_identifier in scoring_functions.keys():
-            score_response = await self.routing_table.get_provider_impl(fn_identifier).score_batch(
+            provider = await self.routing_table.get_provider_impl(fn_identifier)
+            score_response = await provider.score_batch(
                dataset_id=dataset_id,
                scoring_functions={fn_identifier: scoring_functions[fn_identifier]},
            )
@ -66,7 +67,8 @@ class ScoringRouter(Scoring):
        res = {}
        # look up and map each scoring function to its provider impl
        for fn_identifier in scoring_functions.keys():
-            score_response = await self.routing_table.get_provider_impl(fn_identifier).score(
+            provider = await self.routing_table.get_provider_impl(fn_identifier)
+            score_response = await provider.score(
                input_rows=input_rows,
                scoring_functions={fn_identifier: scoring_functions[fn_identifier]},
            )
@ -97,7 +99,8 @@ class EvalRouter(Eval):
        benchmark_config: BenchmarkConfig,
    ) -> Job:
        logger.debug(f"EvalRouter.run_eval: {benchmark_id}")
-        return await self.routing_table.get_provider_impl(benchmark_id).run_eval(
+        provider = await self.routing_table.get_provider_impl(benchmark_id)
+        return await provider.run_eval(
            benchmark_id=benchmark_id,
            benchmark_config=benchmark_config,
        )
@ -110,7 +113,8 @@ class EvalRouter(Eval):
        benchmark_config: BenchmarkConfig,
    ) -> EvaluateResponse:
        logger.debug(f"EvalRouter.evaluate_rows: {benchmark_id}, {len(input_rows)} rows")
-        return await self.routing_table.get_provider_impl(benchmark_id).evaluate_rows(
+        provider = await self.routing_table.get_provider_impl(benchmark_id)
+        return await provider.evaluate_rows(
            benchmark_id=benchmark_id,
            input_rows=input_rows,
            scoring_functions=scoring_functions,
@ -123,7 +127,8 @@ class EvalRouter(Eval):
        job_id: str,
    ) -> Job:
        logger.debug(f"EvalRouter.job_status: {benchmark_id}, {job_id}")
-        return await self.routing_table.get_provider_impl(benchmark_id).job_status(benchmark_id, job_id)
+        provider = await self.routing_table.get_provider_impl(benchmark_id)
+        return await provider.job_status(benchmark_id, job_id)

    async def job_cancel(
        self,
@ -131,7 +136,8 @@ class EvalRouter(Eval):
        job_id: str,
    ) -> None:
        logger.debug(f"EvalRouter.job_cancel: {benchmark_id}, {job_id}")
-        await self.routing_table.get_provider_impl(benchmark_id).job_cancel(
+        provider = await self.routing_table.get_provider_impl(benchmark_id)
+        await provider.job_cancel(
            benchmark_id,
            job_id,
        )
@ -142,7 +148,8 @@ class EvalRouter(Eval):
        job_id: str,
    ) -> EvaluateResponse:
        logger.debug(f"EvalRouter.job_result: {benchmark_id}, {job_id}")
-        return await self.routing_table.get_provider_impl(benchmark_id).job_result(
+        provider = await self.routing_table.get_provider_impl(benchmark_id)
+        return await provider.job_result(
            benchmark_id,
            job_id,
        )
--- a/llama_stack/distribution/routers/inference.py
+++ b/llama_stack/distribution/routers/inference.py
@ -7,6 +7,7 @@
 import asyncio
 import time
 from collections.abc import AsyncGenerator, AsyncIterator
+from datetime import UTC, datetime
 from typing import Annotated, Any

 from openai.types.chat import ChatCompletionToolChoiceOptionParam as OpenAIChatCompletionToolChoiceOptionParam
@ -17,6 +18,7 @@ from llama_stack.apis.common.content_types import (
    InterleavedContent,
    InterleavedContentItem,
 )
+from llama_stack.apis.common.errors import ModelNotFoundError
 from llama_stack.apis.inference import (
    BatchChatCompletionResponse,
    BatchCompletionResponse,
@ -24,14 +26,21 @@ from llama_stack.apis.inference import (
    ChatCompletionResponseEventType,
    ChatCompletionResponseStreamChunk,
    CompletionMessage,
+    CompletionResponse,
+    CompletionResponseStreamChunk,
    EmbeddingsResponse,
    EmbeddingTaskType,
    Inference,
    ListOpenAIChatCompletionResponse,
    LogProbConfig,
    Message,
+    OpenAIAssistantMessageParam,
    OpenAIChatCompletion,
    OpenAIChatCompletionChunk,
+    OpenAIChatCompletionToolCall,
+    OpenAIChatCompletionToolCallFunction,
+    OpenAIChoice,
+    OpenAIChoiceLogprobs,
    OpenAICompletion,
    OpenAICompletionWithInputMessages,
    OpenAIEmbeddingsResponse,
@ -54,7 +63,6 @@ from llama_stack.models.llama.llama3.chat_format import ChatFormat
 from llama_stack.models.llama.llama3.tokenizer import Tokenizer
 from llama_stack.providers.datatypes import HealthResponse, HealthStatus, RoutingTable
 from llama_stack.providers.utils.inference.inference_store import InferenceStore
-from llama_stack.providers.utils.inference.stream_utils import stream_and_store_openai_completion
 from llama_stack.providers.utils.telemetry.tracing import get_current_span

 logger = get_logger(name=__name__, category="core")
@ -79,11 +87,9 @@ class InferenceRouter(Inference):

    async def initialize(self) -> None:
        logger.debug("InferenceRouter.initialize")
-        pass

    async def shutdown(self) -> None:
        logger.debug("InferenceRouter.shutdown")
-        pass

    async def register_model(
        self,
@ -120,6 +126,7 @@ class InferenceRouter(Inference):
        if span is None:
            logger.warning("No span found for token usage metrics")
            return []
+
        metrics = [
            ("prompt_tokens", prompt_tokens),
            ("completion_tokens", completion_tokens),
@ -133,7 +140,7 @@ class InferenceRouter(Inference):
                    span_id=span.span_id,
                    metric=metric_name,
                    value=value,
-                    timestamp=time.time(),
+                    timestamp=datetime.now(UTC),
                    unit="tokens",
                    attributes={
                        "model_id": model.model_id,
@ -190,7 +197,7 @@ class InferenceRouter(Inference):
            sampling_params = SamplingParams()
        model = await self.routing_table.get_model(model_id)
        if model is None:
-            raise ValueError(f"Model '{model_id}' not found")
+            raise ModelNotFoundError(model_id)
        if model.model_type == ModelType.embedding:
            raise ValueError(f"Model '{model_id}' is an embedding model and does not support chat completions")
        if tool_config:
@ -231,53 +238,30 @@ class InferenceRouter(Inference):
            logprobs=logprobs,
            tool_config=tool_config,
        )
-        provider = self.routing_table.get_provider_impl(model_id)
+        provider = await self.routing_table.get_provider_impl(model_id)
        prompt_tokens = await self._count_tokens(messages, tool_config.tool_prompt_format)

        if stream:
-
-            async def stream_generator():
-                completion_text = ""
-                async for chunk in await provider.chat_completion(**params):
-                    if chunk.event.event_type == ChatCompletionResponseEventType.progress:
-                        if chunk.event.delta.type == "text":
-                            completion_text += chunk.event.delta.text
-                    if chunk.event.event_type == ChatCompletionResponseEventType.complete:
-                        completion_tokens = await self._count_tokens(
-                            [
-                                CompletionMessage(
-                                    content=completion_text,
-                                    stop_reason=StopReason.end_of_turn,
-                                )
-                            ],
-                            tool_config.tool_prompt_format,
-                        )
-                        total_tokens = (prompt_tokens or 0) + (completion_tokens or 0)
-                        metrics = await self._compute_and_log_token_usage(
-                            prompt_tokens or 0,
-                            completion_tokens or 0,
-                            total_tokens,
-                            model,
-                        )
-                        chunk.metrics = metrics if chunk.metrics is None else chunk.metrics + metrics
-                    yield chunk
-
-            return stream_generator()
-        else:
-            response = await provider.chat_completion(**params)
-            completion_tokens = await self._count_tokens(
-                [response.completion_message],
-                tool_config.tool_prompt_format,
+            response_stream = await provider.chat_completion(**params)
+            return self.stream_tokens_and_compute_metrics(
+                response=response_stream,
+                prompt_tokens=prompt_tokens,
+                model=model,
+                tool_prompt_format=tool_config.tool_prompt_format,
            )
-            total_tokens = (prompt_tokens or 0) + (completion_tokens or 0)
-            metrics = await self._compute_and_log_token_usage(
-                prompt_tokens or 0,
-                completion_tokens or 0,
-                total_tokens,
-                model,
-            )
-            response.metrics = metrics if response.metrics is None else response.metrics + metrics
-            return response
+
+        response = await provider.chat_completion(**params)
+        metrics = await self.count_tokens_and_compute_metrics(
+            response=response,
+            prompt_tokens=prompt_tokens,
+            model=model,
+            tool_prompt_format=tool_config.tool_prompt_format,
+        )
+        # these metrics will show up in the client response.
+        response.metrics = (
+            metrics if not hasattr(response, "metrics") or response.metrics is None else response.metrics + metrics
+        )
+        return response

    async def batch_chat_completion(
        self,
@ -292,7 +276,7 @@ class InferenceRouter(Inference):
        logger.debug(
            f"InferenceRouter.batch_chat_completion: {model_id=}, {len(messages_batch)=}, {sampling_params=}, {response_format=}, {logprobs=}",
        )
-        provider = self.routing_table.get_provider_impl(model_id)
+        provider = await self.routing_table.get_provider_impl(model_id)
        return await provider.batch_chat_completion(
            model_id=model_id,
            messages_batch=messages_batch,
@ -319,10 +303,10 @@ class InferenceRouter(Inference):
        )
        model = await self.routing_table.get_model(model_id)
        if model is None:
-            raise ValueError(f"Model '{model_id}' not found")
+            raise ModelNotFoundError(model_id)
        if model.model_type == ModelType.embedding:
            raise ValueError(f"Model '{model_id}' is an embedding model and does not support chat completions")
-        provider = self.routing_table.get_provider_impl(model_id)
+        provider = await self.routing_table.get_provider_impl(model_id)
        params = dict(
            model_id=model_id,
            content=content,
@ -333,39 +317,20 @@ class InferenceRouter(Inference):
        )

        prompt_tokens = await self._count_tokens(content)
-
+        response = await provider.completion(**params)
        if stream:
-
-            async def stream_generator():
-                completion_text = ""
-                async for chunk in await provider.completion(**params):
-                    if hasattr(chunk, "delta"):
-                        completion_text += chunk.delta
-                    if hasattr(chunk, "stop_reason") and chunk.stop_reason and self.telemetry:
-                        completion_tokens = await self._count_tokens(completion_text)
-                        total_tokens = (prompt_tokens or 0) + (completion_tokens or 0)
-                        metrics = await self._compute_and_log_token_usage(
-                            prompt_tokens or 0,
-                            completion_tokens or 0,
-                            total_tokens,
-                            model,
-                        )
-                        chunk.metrics = metrics if chunk.metrics is None else chunk.metrics + metrics
-                    yield chunk
-
-            return stream_generator()
-        else:
-            response = await provider.completion(**params)
-            completion_tokens = await self._count_tokens(response.content)
-            total_tokens = (prompt_tokens or 0) + (completion_tokens or 0)
-            metrics = await self._compute_and_log_token_usage(
-                prompt_tokens or 0,
-                completion_tokens or 0,
-                total_tokens,
-                model,
+            return self.stream_tokens_and_compute_metrics(
+                response=response,
+                prompt_tokens=prompt_tokens,
+                model=model,
            )
-            response.metrics = metrics if response.metrics is None else response.metrics + metrics
-            return response
+
+        metrics = await self.count_tokens_and_compute_metrics(
+            response=response, prompt_tokens=prompt_tokens, model=model
+        )
+        response.metrics = metrics if response.metrics is None else response.metrics + metrics
+
+        return response

    async def batch_completion(
        self,
@ -378,7 +343,7 @@ class InferenceRouter(Inference):
        logger.debug(
            f"InferenceRouter.batch_completion: {model_id=}, {len(content_batch)=}, {sampling_params=}, {response_format=}, {logprobs=}",
        )
-        provider = self.routing_table.get_provider_impl(model_id)
+        provider = await self.routing_table.get_provider_impl(model_id)
        return await provider.batch_completion(model_id, content_batch, sampling_params, response_format, logprobs)

    async def embeddings(
@ -392,10 +357,11 @@ class InferenceRouter(Inference):
        logger.debug(f"InferenceRouter.embeddings: {model_id}")
        model = await self.routing_table.get_model(model_id)
        if model is None:
-            raise ValueError(f"Model '{model_id}' not found")
+            raise ModelNotFoundError(model_id)
        if model.model_type == ModelType.llm:
            raise ValueError(f"Model '{model_id}' is an LLM model and does not support embeddings")
-        return await self.routing_table.get_provider_impl(model_id).embeddings(
+        provider = await self.routing_table.get_provider_impl(model_id)
+        return await provider.embeddings(
            model_id=model_id,
            contents=contents,
            text_truncation=text_truncation,
@ -431,7 +397,7 @@ class InferenceRouter(Inference):
        )
        model_obj = await self.routing_table.get_model(model)
        if model_obj is None:
-            raise ValueError(f"Model '{model}' not found")
+            raise ModelNotFoundError(model)
        if model_obj.model_type == ModelType.embedding:
            raise ValueError(f"Model '{model}' is an embedding model and does not support completions")

@ -457,9 +423,29 @@ class InferenceRouter(Inference):
            prompt_logprobs=prompt_logprobs,
            suffix=suffix,
        )
+        provider = await self.routing_table.get_provider_impl(model_obj.identifier)
+        if stream:
+            return await provider.openai_completion(**params)
+            # TODO: Metrics do NOT work with openai_completion stream=True because we do not
+            # return an AsyncIterator; our tests expect a stream of chunks that we cannot currently intercept.
+            # response_stream = await provider.openai_completion(**params)

-        provider = self.routing_table.get_provider_impl(model_obj.identifier)
-        return await provider.openai_completion(**params)
+        response = await provider.openai_completion(**params)
+        if self.telemetry:
+            metrics = self._construct_metrics(
+                prompt_tokens=response.usage.prompt_tokens,
+                completion_tokens=response.usage.completion_tokens,
+                total_tokens=response.usage.total_tokens,
+                model=model_obj,
+            )
+            for metric in metrics:
+                await self.telemetry.log_event(metric)
+
+            # these metrics will show up in the client response.
+            response.metrics = (
+                metrics if not hasattr(response, "metrics") or response.metrics is None else response.metrics + metrics
+            )
+        return response

    async def openai_chat_completion(
        self,
@ -492,7 +478,7 @@ class InferenceRouter(Inference):
        )
        model_obj = await self.routing_table.get_model(model)
        if model_obj is None:
-            raise ValueError(f"Model '{model}' not found")
+            raise ModelNotFoundError(model)
        if model_obj.model_type == ModelType.embedding:
            raise ValueError(f"Model '{model}' is an embedding model and does not support chat completions")

@ -537,18 +523,38 @@ class InferenceRouter(Inference):
            top_p=top_p,
            user=user,
        )
-
-        provider = self.routing_table.get_provider_impl(model_obj.identifier)
+        provider = await self.routing_table.get_provider_impl(model_obj.identifier)
        if stream:
            response_stream = await provider.openai_chat_completion(**params)
-            if self.store:
-                return stream_and_store_openai_completion(response_stream, model, self.store, messages)
-            return response_stream
-        else:
-            response = await self._nonstream_openai_chat_completion(provider, params)
-            if self.store:
-                await self.store.store_chat_completion(response, messages)
-            return response
+
+            # For streaming, the provider returns AsyncIterator[OpenAIChatCompletionChunk]
+            # We need to add metrics to each chunk and store the final completion
+            return self.stream_tokens_and_compute_metrics_openai_chat(
+                response=response_stream,
+                model=model_obj,
+                messages=messages,
+            )
+
+        response = await self._nonstream_openai_chat_completion(provider, params)
+
+        # Store the response with the ID that will be returned to the client
+        if self.store:
+            await self.store.store_chat_completion(response, messages)
+
+        if self.telemetry:
+            metrics = self._construct_metrics(
+                prompt_tokens=response.usage.prompt_tokens,
+                completion_tokens=response.usage.completion_tokens,
+                total_tokens=response.usage.total_tokens,
+                model=model_obj,
+            )
+            for metric in metrics:
+                await self.telemetry.log_event(metric)
+            # these metrics will show up in the client response.
+            response.metrics = (
+                metrics if not hasattr(response, "metrics") or response.metrics is None else response.metrics + metrics
+            )
+        return response

    async def openai_embeddings(
        self,
@ -563,7 +569,7 @@ class InferenceRouter(Inference):
        )
        model_obj = await self.routing_table.get_model(model)
        if model_obj is None:
-            raise ValueError(f"Model '{model}' not found")
+            raise ModelNotFoundError(model)
        if model_obj.model_type != ModelType.embedding:
            raise ValueError(f"Model '{model}' is not an embedding model")

@ -575,7 +581,7 @@ class InferenceRouter(Inference):
            user=user,
        )

-        provider = self.routing_table.get_provider_impl(model_obj.identifier)
+        provider = await self.routing_table.get_provider_impl(model_obj.identifier)
        return await provider.openai_embeddings(**params)

    async def list_chat_completions(
@ -625,3 +631,244 @@ class InferenceRouter(Inference):
                    status=HealthStatus.ERROR, message=f"Health check failed: {str(e)}"
                )
        return health_statuses
+
+    async def stream_tokens_and_compute_metrics(
+        self,
+        response,
+        prompt_tokens,
+        model,
+        tool_prompt_format: ToolPromptFormat | None = None,
+    ) -> AsyncGenerator[ChatCompletionResponseStreamChunk, None] | AsyncGenerator[CompletionResponseStreamChunk, None]:
+        """Re-yield every chunk of a streaming response, attaching token metrics to the final chunk.
+
+        Chat-completion chunks are recognised by their ``event`` attribute: text deltas of
+        ``progress`` events are accumulated and the accumulated text is token-counted when the
+        ``complete`` event arrives. Any other chunk is treated as a plain completion chunk: its
+        ``delta`` is accumulated and the stream is considered finished when ``stop_reason`` is
+        set (this branch only triggers when telemetry is configured).
+
+        :param response: async iterable of stream chunks coming from the provider
+        :param prompt_tokens: number of prompt tokens (may be ``None``)
+        :param model: model the metrics are attributed to
+        :param tool_prompt_format: forwarded to the token counter for chat completions
+        :returns: async generator yielding the same chunks; the final one carries ``metrics``
+        """
+        completion_text = ""
+        async for chunk in response:
+            complete = False
+            if hasattr(chunk, "event"):  # only ChatCompletions have .event
+                if chunk.event.event_type == ChatCompletionResponseEventType.progress:
+                    if chunk.event.delta.type == "text":
+                        completion_text += chunk.event.delta.text
+                if chunk.event.event_type == ChatCompletionResponseEventType.complete:
+                    complete = True
+                    completion_tokens = await self._count_tokens(
+                        [
+                            CompletionMessage(
+                                content=completion_text,
+                                stop_reason=StopReason.end_of_turn,
+                            )
+                        ],
+                        tool_prompt_format=tool_prompt_format,
+                    )
+            else:
+                if hasattr(chunk, "delta"):
+                    completion_text += chunk.delta
+                if hasattr(chunk, "stop_reason") and chunk.stop_reason and self.telemetry:
+                    complete = True
+                    completion_tokens = await self._count_tokens(completion_text)
+            # if we are done receiving tokens
+            if complete:
+                total_tokens = (prompt_tokens or 0) + (completion_tokens or 0)
+
+                if self.telemetry:
+                    completion_metrics = self._construct_metrics(
+                        prompt_tokens=prompt_tokens,
+                        completion_tokens=completion_tokens,
+                        total_tokens=total_tokens,
+                        model=model,
+                    )
+                    for metric in completion_metrics:
+                        if metric.metric in [
+                            "completion_tokens",
+                            "total_tokens",
+                        ]:  # Only log completion and total tokens
+                            await self.telemetry.log_event(metric)
+                else:
+                    # Fallback if no telemetry
+                    completion_metrics = self._construct_metrics(
+                        prompt_tokens or 0,
+                        completion_tokens or 0,
+                        total_tokens,
+                        model,
+                    )
+
+                # Attach the metrics exactly once, after the logging loop; doing this inside
+                # the loop would append the full metric list once per metric.
+                async_metrics = [
+                    MetricInResponse(metric=metric.metric, value=metric.value) for metric in completion_metrics
+                ]
+                chunk.metrics = async_metrics if chunk.metrics is None else chunk.metrics + async_metrics
+            yield chunk
+
+    async def count_tokens_and_compute_metrics(
+        self,
+        response: ChatCompletionResponse | CompletionResponse,
+        prompt_tokens,
+        model,
+        tool_prompt_format: ToolPromptFormat | None = None,
+    ):
+        """Count the completion tokens of a non-streaming response and return its metrics.
+
+        When telemetry is configured, the ``completion_tokens`` and ``total_tokens`` metrics
+        are also logged as telemetry events.
+
+        :param response: the finished chat-completion or completion response
+        :param prompt_tokens: number of prompt tokens (may be ``None``)
+        :param model: model the metrics are attributed to
+        :param tool_prompt_format: forwarded to the token counter
+        :returns: list of ``MetricInResponse`` built from the constructed metrics
+        """
+        content = [response.completion_message] if isinstance(response, ChatCompletionResponse) else response.content
+        completion_tokens = await self._count_tokens(messages=content, tool_prompt_format=tool_prompt_format)
+        total_tokens = (prompt_tokens or 0) + (completion_tokens or 0)
+
+        if not self.telemetry:
+            # Without telemetry nothing is logged; missing counts are reported as 0.
+            computed = self._construct_metrics(
+                prompt_tokens or 0,
+                completion_tokens or 0,
+                total_tokens,
+                model,
+            )
+            return [MetricInResponse(metric=m.metric, value=m.value) for m in computed]
+
+        computed = self._construct_metrics(
+            prompt_tokens=prompt_tokens,
+            completion_tokens=completion_tokens,
+            total_tokens=total_tokens,
+            model=model,
+        )
+        # Only completion and total tokens are logged here.
+        logged_names = ("completion_tokens", "total_tokens")
+        for m in computed:
+            if m.metric in logged_names:
+                await self.telemetry.log_event(m)
+
+        return [MetricInResponse(metric=m.metric, value=m.value) for m in computed]
+
+    async def stream_tokens_and_compute_metrics_openai_chat(
+        self,
+        response: AsyncIterator[OpenAIChatCompletionChunk],
+        model: Model,
+        messages: list[OpenAIMessageParam] | None = None,
+    ) -> AsyncIterator[OpenAIChatCompletionChunk]:
+        """Stream OpenAI chat completion chunks, log usage metrics, and store the final completion.
+
+        Every non-``None`` chunk is yielded unchanged. While streaming, per-choice content,
+        tool-call fragments, finish reasons and logprobs are accumulated so that a complete
+        ``OpenAIChatCompletion`` can be assembled and stored once the stream ends. Storage
+        runs in a ``finally`` clause and is skipped when no chunk id was seen, no store is
+        configured, or ``messages`` is empty.
+
+        :param response: async iterator of chunks produced by the provider
+        :param model: model object; its ``identifier`` is recorded on the stored completion
+        :param messages: input messages stored alongside the assembled completion
+        :returns: async iterator yielding the provider's chunks
+        """
+        completion_id = None
+        created = None
+        choices_data: dict[int, dict[str, Any]] = {}
+
+        try:
+            async for chunk in response:
+                # Skip None chunks
+                if chunk is None:
+                    continue
+
+                # Capture ID and created timestamp from the first chunk that carries them
+                if completion_id is None and chunk.id:
+                    completion_id = chunk.id
+                if created is None and chunk.created:
+                    created = chunk.created
+
+                # Accumulate choice data for final assembly
+                if chunk.choices:
+                    for choice_delta in chunk.choices:
+                        idx = choice_delta.index
+                        if idx not in choices_data:
+                            choices_data[idx] = {
+                                "content_parts": [],
+                                "tool_calls_builder": {},
+                                "finish_reason": None,
+                                "logprobs_content_parts": [],
+                            }
+                        current_choice_data = choices_data[idx]
+
+                        if choice_delta.delta:
+                            delta = choice_delta.delta
+                            if delta.content:
+                                current_choice_data["content_parts"].append(delta.content)
+                            if delta.tool_calls:
+                                for tool_call_delta in delta.tool_calls:
+                                    tc_idx = tool_call_delta.index
+                                    if tc_idx not in current_choice_data["tool_calls_builder"]:
+                                        current_choice_data["tool_calls_builder"][tc_idx] = {
+                                            "id": None,
+                                            "type": "function",
+                                            "function_name_parts": [],
+                                            "function_arguments_parts": [],
+                                        }
+                                    builder = current_choice_data["tool_calls_builder"][tc_idx]
+                                    if tool_call_delta.id:
+                                        builder["id"] = tool_call_delta.id
+                                    if tool_call_delta.type:
+                                        builder["type"] = tool_call_delta.type
+                                    if tool_call_delta.function:
+                                        if tool_call_delta.function.name:
+                                            builder["function_name_parts"].append(tool_call_delta.function.name)
+                                        if tool_call_delta.function.arguments:
+                                            builder["function_arguments_parts"].append(
+                                                tool_call_delta.function.arguments
+                                            )
+                        if choice_delta.finish_reason:
+                            current_choice_data["finish_reason"] = choice_delta.finish_reason
+                        if choice_delta.logprobs and choice_delta.logprobs.content:
+                            current_choice_data["logprobs_content_parts"].extend(choice_delta.logprobs.content)
+
+                # Log usage metrics when the chunk whose first choice is finished carries usage.
+                # The metrics are only sent to telemetry; the chunk itself is not modified.
+                if chunk.choices and chunk.choices[0].finish_reason:
+                    if self.telemetry and chunk.usage:
+                        metrics = self._construct_metrics(
+                            prompt_tokens=chunk.usage.prompt_tokens,
+                            completion_tokens=chunk.usage.completion_tokens,
+                            total_tokens=chunk.usage.total_tokens,
+                            model=model,
+                        )
+                        for metric in metrics:
+                            await self.telemetry.log_event(metric)
+
+                yield chunk
+        finally:
+            # Store the final assembled completion
+            if completion_id and self.store and messages:
+                assembled_choices: list[OpenAIChoice] = []
+                for choice_idx, choice_data in choices_data.items():
+                    content_str = "".join(choice_data["content_parts"])
+                    assembled_tool_calls: list[OpenAIChatCompletionToolCall] = []
+                    if choice_data["tool_calls_builder"]:
+                        for tc_build_data in choice_data["tool_calls_builder"].values():
+                            # Tool calls that never received an id are dropped
+                            if tc_build_data["id"]:
+                                func_name = "".join(tc_build_data["function_name_parts"])
+                                func_args = "".join(tc_build_data["function_arguments_parts"])
+                                assembled_tool_calls.append(
+                                    OpenAIChatCompletionToolCall(
+                                        id=tc_build_data["id"],
+                                        type=tc_build_data["type"],
+                                        function=OpenAIChatCompletionToolCallFunction(
+                                            name=func_name, arguments=func_args
+                                        ),
+                                    )
+                                )
+                    message = OpenAIAssistantMessageParam(
+                        role="assistant",
+                        content=content_str if content_str else None,
+                        tool_calls=assembled_tool_calls if assembled_tool_calls else None,
+                    )
+                    logprobs_content = choice_data["logprobs_content_parts"]
+                    final_logprobs = OpenAIChoiceLogprobs(content=logprobs_content) if logprobs_content else None
+
+                    assembled_choices.append(
+                        OpenAIChoice(
+                            finish_reason=choice_data["finish_reason"],
+                            index=choice_idx,
+                            message=message,
+                            logprobs=final_logprobs,
+                        )
+                    )
+
+                final_response = OpenAIChatCompletion(
+                    id=completion_id,
+                    choices=assembled_choices,
+                    # Fall back to the current time when no chunk supplied a timestamp
+                    created=created or int(time.time()),
+                    model=model.identifier,
+                    object="chat.completion",
+                )
+                await self.store.store_chat_completion(final_response, messages)
--- a/llama_stack/core/routers/safety.py
+++ b/llama_stack/core/routers/safety.py
@ -0,0 +1,101 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from typing import Any
+
+from llama_stack.apis.inference import (
+    Message,
+)
+from llama_stack.apis.safety import RunShieldResponse, Safety
+from llama_stack.apis.safety.safety import ModerationObject, OpenAICategories
+from llama_stack.apis.shields import Shield
+from llama_stack.log import get_logger
+from llama_stack.providers.datatypes import RoutingTable
+
+logger = get_logger(name=__name__, category="core")
+
+
+class SafetyRouter(Safety):
+    """Safety API implementation that forwards each call to the provider resolved by the routing table."""
+
+    def __init__(
+        self,
+        routing_table: RoutingTable,
+    ) -> None:
+        logger.debug("Initializing SafetyRouter")
+        self.routing_table = routing_table
+
+    async def initialize(self) -> None:
+        """Log the call; the router has nothing else to set up."""
+        logger.debug("SafetyRouter.initialize")
+
+    async def shutdown(self) -> None:
+        """Log the call; the router has nothing else to tear down."""
+        logger.debug("SafetyRouter.shutdown")
+
+    async def register_shield(
+        self,
+        shield_id: str,
+        provider_shield_id: str | None = None,
+        provider_id: str | None = None,
+        params: dict[str, Any] | None = None,
+    ) -> Shield:
+        """Register a shield by delegating to the routing table and return the result."""
+        logger.debug(f"SafetyRouter.register_shield: {shield_id}")
+        return await self.routing_table.register_shield(shield_id, provider_shield_id, provider_id, params)
+
+    async def unregister_shield(self, identifier: str) -> None:
+        """Unregister a shield by delegating to the routing table."""
+        logger.debug(f"SafetyRouter.unregister_shield: {identifier}")
+        return await self.routing_table.unregister_shield(identifier)
+
+    async def run_shield(
+        self,
+        shield_id: str,
+        messages: list[Message],
+        params: dict[str, Any] | None = None,
+    ) -> RunShieldResponse:
+        """Run the given shield on ``messages`` using the provider that owns ``shield_id``.
+
+        :param shield_id: identifier used to look up the provider in the routing table
+        :param messages: messages passed through to the provider
+        :param params: optional provider parameters, passed through unchanged
+        :returns: the provider's ``RunShieldResponse``
+        """
+        logger.debug(f"SafetyRouter.run_shield: {shield_id}")
+        provider = await self.routing_table.get_provider_impl(shield_id)
+        return await provider.run_shield(
+            shield_id=shield_id,
+            messages=messages,
+            params=params,
+        )
+
+    async def run_moderation(self, input: str | list[str], model: str) -> ModerationObject:
+        """Run moderation through the shield whose ``provider_resource_id`` equals ``model``.
+
+        :param input: text or list of texts passed through to the provider
+        :param model: provider resource id used to find the shield
+        :returns: the provider's ``ModerationObject`` after category validation
+        :raises ValueError: if no shield or more than one shield matches ``model``, or if the
+            response lacks required categories
+        """
+        # Resolve the shield id from the model (provider_resource_id) of the shield.
+        list_shields_response = await self.routing_table.list_shields()
+        matches = [s.identifier for s in list_shields_response.data if model == s.provider_resource_id]
+        if not matches:
+            raise ValueError(f"No shield associated with provider_resource id {model}")
+        if len(matches) > 1:
+            raise ValueError(f"Multiple shields associated with provider_resource id {model}")
+        shield_id = matches[0]
+
+        logger.debug(f"SafetyRouter.run_moderation: {shield_id}")
+        provider = await self.routing_table.get_provider_impl(shield_id)
+
+        response = await provider.run_moderation(
+            input=input,
+            model=model,
+        )
+        self._validate_required_categories_exist(response)
+
+        return response
+
+    def _validate_required_categories_exist(self, response: ModerationObject) -> None:
+        """Validate the ProviderImpl response contains the required Open AI moderations categories.
+
+        Only the first entry of ``response.results`` is inspected. Its ``categories``,
+        ``category_applied_input_types`` and ``category_scores`` mappings must each contain
+        every member of ``OpenAICategories``.
+
+        :raises ValueError: if any of the three mappings is missing a required category
+        """
+        # Build the required set once instead of once per mapping.
+        required_categories = set(map(str, OpenAICategories))
+
+        first_result = response.results[0]
+        mappings = (
+            first_result.categories,
+            first_result.category_applied_input_types,
+            first_result.category_scores,
+        )
+
+        for mapping in mappings:
+            missing = required_categories - set(mapping.keys())
+            if missing:
+                raise ValueError(f"ProviderImpl response is missing required categories: {missing}")
--- a/llama_stack/distribution/routers/tool_runtime.py
+++ b/llama_stack/distribution/routers/tool_runtime.py
@ -41,9 +41,8 @@ class ToolRuntimeRouter(ToolRuntime):
            query_config: RAGQueryConfig | None = None,
        ) -> RAGQueryResult:
            logger.debug(f"ToolRuntimeRouter.RagToolImpl.query: {vector_db_ids}")
-            return await self.routing_table.get_provider_impl("knowledge_search").query(
-                content, vector_db_ids, query_config
-            )
+            provider = await self.routing_table.get_provider_impl("knowledge_search")
+            return await provider.query(content, vector_db_ids, query_config)

        async def insert(
            self,
@ -54,9 +53,8 @@ class ToolRuntimeRouter(ToolRuntime):
            logger.debug(
                f"ToolRuntimeRouter.RagToolImpl.insert: {vector_db_id}, {len(documents)} documents, chunk_size={chunk_size_in_tokens}"
            )
-            return await self.routing_table.get_provider_impl("insert_into_memory").insert(
-                documents, vector_db_id, chunk_size_in_tokens
-            )
+            provider = await self.routing_table.get_provider_impl("insert_into_memory")
+            return await provider.insert(documents, vector_db_id, chunk_size_in_tokens)

    def __init__(
        self,
@ -80,7 +78,8 @@ class ToolRuntimeRouter(ToolRuntime):

    async def invoke_tool(self, tool_name: str, kwargs: dict[str, Any]) -> Any:
        logger.debug(f"ToolRuntimeRouter.invoke_tool: {tool_name}")
-        return await self.routing_table.get_provider_impl(tool_name).invoke_tool(
+        provider = await self.routing_table.get_provider_impl(tool_name)
+        return await provider.invoke_tool(
            tool_name=tool_name,
            kwargs=kwargs,
        )
--- a/llama_stack/distribution/routers/vector_io.py
+++ b/llama_stack/distribution/routers/vector_io.py
@ -5,6 +5,7 @@
 # the root directory of this source tree.

 import asyncio
+import uuid
 from typing import Any

 from llama_stack.apis.common.content_types import (
@ -81,6 +82,7 @@ class VectorIORouter(VectorIO):
        embedding_model: str,
        embedding_dimension: int | None = 384,
        provider_id: str | None = None,
+        vector_db_name: str | None = None,
        provider_vector_db_id: str | None = None,
    ) -> None:
        logger.debug(f"VectorIORouter.register_vector_db: {vector_db_id}, {embedding_model}")
@ -89,6 +91,7 @@ class VectorIORouter(VectorIO):
            embedding_model,
            embedding_dimension,
            provider_id,
+            vector_db_name,
            provider_vector_db_id,
        )

@ -101,7 +104,8 @@ class VectorIORouter(VectorIO):
        logger.debug(
            f"VectorIORouter.insert_chunks: {vector_db_id}, {len(chunks)} chunks, ttl_seconds={ttl_seconds}, chunk_ids={[chunk.metadata['document_id'] for chunk in chunks[:3]]}{' and more...' if len(chunks) > 3 else ''}",
        )
-        return await self.routing_table.get_provider_impl(vector_db_id).insert_chunks(vector_db_id, chunks, ttl_seconds)
+        provider = await self.routing_table.get_provider_impl(vector_db_id)
+        return await provider.insert_chunks(vector_db_id, chunks, ttl_seconds)

    async def query_chunks(
        self,
@ -110,7 +114,8 @@ class VectorIORouter(VectorIO):
        params: dict[str, Any] | None = None,
    ) -> QueryChunksResponse:
        logger.debug(f"VectorIORouter.query_chunks: {vector_db_id}")
-        return await self.routing_table.get_provider_impl(vector_db_id).query_chunks(vector_db_id, query, params)
+        provider = await self.routing_table.get_provider_impl(vector_db_id)
+        return await provider.query_chunks(vector_db_id, query, params)

    # OpenAI Vector Stores API endpoints
    async def openai_create_vector_store(
@ -123,7 +128,6 @@ class VectorIORouter(VectorIO):
        embedding_model: str | None = None,
        embedding_dimension: int | None = None,
        provider_id: str | None = None,
-        provider_vector_db_id: str | None = None,
    ) -> VectorStoreObject:
        logger.debug(f"VectorIORouter.openai_create_vector_store: name={name}, provider_id={provider_id}")

@ -135,17 +139,18 @@ class VectorIORouter(VectorIO):
            embedding_model, embedding_dimension = embedding_model_info
            logger.info(f"No embedding model specified, using first available: {embedding_model}")

-        vector_db_id = name
+        vector_db_id = f"vs_{uuid.uuid4()}"
        registered_vector_db = await self.routing_table.register_vector_db(
-            vector_db_id,
-            embedding_model,
-            embedding_dimension,
-            provider_id,
-            provider_vector_db_id,
+            vector_db_id=vector_db_id,
+            embedding_model=embedding_model,
+            embedding_dimension=embedding_dimension,
+            provider_id=provider_id,
+            provider_vector_db_id=vector_db_id,
+            vector_db_name=name,
        )
-
-        return await self.routing_table.get_provider_impl(registered_vector_db.identifier).openai_create_vector_store(
-            vector_db_id,
+        provider = await self.routing_table.get_provider_impl(registered_vector_db.identifier)
+        return await provider.openai_create_vector_store(
+            name=name,
            file_ids=file_ids,
            expires_after=expires_after,
            chunking_strategy=chunking_strategy,
@ -170,9 +175,8 @@ class VectorIORouter(VectorIO):
        all_stores = []
        for vector_db in vector_dbs:
            try:
-                vector_store = await self.routing_table.get_provider_impl(
-                    vector_db.identifier
-                ).openai_retrieve_vector_store(vector_db.identifier)
+                provider = await self.routing_table.get_provider_impl(vector_db.identifier)
+                vector_store = await provider.openai_retrieve_vector_store(vector_db.identifier)
                all_stores.append(vector_store)
            except Exception as e:
                logger.error(f"Error retrieving vector store {vector_db.identifier}: {e}")
@ -212,9 +216,7 @@ class VectorIORouter(VectorIO):
        vector_store_id: str,
    ) -> VectorStoreObject:
        logger.debug(f"VectorIORouter.openai_retrieve_vector_store: {vector_store_id}")
-        # Route based on vector store ID
-        provider = self.routing_table.get_provider_impl(vector_store_id)
-        return await provider.openai_retrieve_vector_store(vector_store_id)
+        return await self.routing_table.openai_retrieve_vector_store(vector_store_id)

    async def openai_update_vector_store(
        self,
@ -224,9 +226,7 @@ class VectorIORouter(VectorIO):
        metadata: dict[str, Any] | None = None,
    ) -> VectorStoreObject:
        logger.debug(f"VectorIORouter.openai_update_vector_store: {vector_store_id}")
-        # Route based on vector store ID
-        provider = self.routing_table.get_provider_impl(vector_store_id)
-        return await provider.openai_update_vector_store(
+        return await self.routing_table.openai_update_vector_store(
            vector_store_id=vector_store_id,
            name=name,
            expires_after=expires_after,
@ -238,12 +238,7 @@ class VectorIORouter(VectorIO):
        vector_store_id: str,
    ) -> VectorStoreDeleteResponse:
        logger.debug(f"VectorIORouter.openai_delete_vector_store: {vector_store_id}")
-        # Route based on vector store ID
-        provider = self.routing_table.get_provider_impl(vector_store_id)
-        result = await provider.openai_delete_vector_store(vector_store_id)
-        # drop from registry
-        await self.routing_table.unregister_vector_db(vector_store_id)
-        return result
+        return await self.routing_table.openai_delete_vector_store(vector_store_id)

    async def openai_search_vector_store(
        self,
@ -256,9 +251,7 @@ class VectorIORouter(VectorIO):
        search_mode: str | None = "vector",
    ) -> VectorStoreSearchResponsePage:
        logger.debug(f"VectorIORouter.openai_search_vector_store: {vector_store_id}")
-        # Route based on vector store ID
-        provider = self.routing_table.get_provider_impl(vector_store_id)
-        return await provider.openai_search_vector_store(
+        return await self.routing_table.openai_search_vector_store(
            vector_store_id=vector_store_id,
            query=query,
            filters=filters,
@ -276,9 +269,7 @@ class VectorIORouter(VectorIO):
        chunking_strategy: VectorStoreChunkingStrategy | None = None,
    ) -> VectorStoreFileObject:
        logger.debug(f"VectorIORouter.openai_attach_file_to_vector_store: {vector_store_id}, {file_id}")
-        # Route based on vector store ID
-        provider = self.routing_table.get_provider_impl(vector_store_id)
-        return await provider.openai_attach_file_to_vector_store(
+        return await self.routing_table.openai_attach_file_to_vector_store(
            vector_store_id=vector_store_id,
            file_id=file_id,
            attributes=attributes,
@ -295,9 +286,7 @@ class VectorIORouter(VectorIO):
        filter: VectorStoreFileStatus | None = None,
    ) -> list[VectorStoreFileObject]:
        logger.debug(f"VectorIORouter.openai_list_files_in_vector_store: {vector_store_id}")
-        # Route based on vector store ID
-        provider = self.routing_table.get_provider_impl(vector_store_id)
-        return await provider.openai_list_files_in_vector_store(
+        return await self.routing_table.openai_list_files_in_vector_store(
            vector_store_id=vector_store_id,
            limit=limit,
            order=order,
@ -312,9 +301,7 @@ class VectorIORouter(VectorIO):
        file_id: str,
    ) -> VectorStoreFileObject:
        logger.debug(f"VectorIORouter.openai_retrieve_vector_store_file: {vector_store_id}, {file_id}")
-        # Route based on vector store ID
-        provider = self.routing_table.get_provider_impl(vector_store_id)
-        return await provider.openai_retrieve_vector_store_file(
+        return await self.routing_table.openai_retrieve_vector_store_file(
            vector_store_id=vector_store_id,
            file_id=file_id,
        )
@ -325,9 +312,7 @@ class VectorIORouter(VectorIO):
        file_id: str,
    ) -> VectorStoreFileContentsResponse:
        logger.debug(f"VectorIORouter.openai_retrieve_vector_store_file_contents: {vector_store_id}, {file_id}")
-        # Route based on vector store ID
-        provider = self.routing_table.get_provider_impl(vector_store_id)
-        return await provider.openai_retrieve_vector_store_file_contents(
+        return await self.routing_table.openai_retrieve_vector_store_file_contents(
            vector_store_id=vector_store_id,
            file_id=file_id,
        )
@ -339,9 +324,7 @@ class VectorIORouter(VectorIO):
        attributes: dict[str, Any],
    ) -> VectorStoreFileObject:
        logger.debug(f"VectorIORouter.openai_update_vector_store_file: {vector_store_id}, {file_id}")
-        # Route based on vector store ID
-        provider = self.routing_table.get_provider_impl(vector_store_id)
-        return await provider.openai_update_vector_store_file(
+        return await self.routing_table.openai_update_vector_store_file(
            vector_store_id=vector_store_id,
            file_id=file_id,
            attributes=attributes,
@ -353,9 +336,7 @@ class VectorIORouter(VectorIO):
        file_id: str,
    ) -> VectorStoreFileDeleteResponse:
        logger.debug(f"VectorIORouter.openai_delete_vector_store_file: {vector_store_id}, {file_id}")
-        # Route based on vector store ID
-        provider = self.routing_table.get_provider_impl(vector_store_id)
-        return await provider.openai_delete_vector_store_file(
+        return await self.routing_table.openai_delete_vector_store_file(
            vector_store_id=vector_store_id,
            file_id=file_id,
        )
--- a/llama_stack/distribution/routing_tables/init.py
+++ b/llama_stack/distribution/routing_tables/init.py
--- a/llama_stack/distribution/routing_tables/benchmarks.py
+++ b/llama_stack/distribution/routing_tables/benchmarks.py
@ -7,7 +7,7 @@
 from typing import Any

 from llama_stack.apis.benchmarks import Benchmark, Benchmarks, ListBenchmarksResponse
-from llama_stack.distribution.datatypes import (
+from llama_stack.core.datatypes import (
    BenchmarkWithOwner,
 )
 from llama_stack.log import get_logger
--- a/llama_stack/distribution/routing_tables/common.py
+++ b/llama_stack/distribution/routing_tables/common.py
@ -6,17 +6,20 @@

 from typing import Any

+from llama_stack.apis.common.errors import ModelNotFoundError
+from llama_stack.apis.models import Model
 from llama_stack.apis.resource import ResourceType
 from llama_stack.apis.scoring_functions import ScoringFn
-from llama_stack.distribution.access_control.access_control import AccessDeniedError, is_action_allowed
-from llama_stack.distribution.datatypes import (
+from llama_stack.core.access_control.access_control import AccessDeniedError, is_action_allowed
+from llama_stack.core.access_control.datatypes import Action
+from llama_stack.core.datatypes import (
    AccessRule,
    RoutableObject,
    RoutableObjectWithProvider,
    RoutedProtocol,
 )
-from llama_stack.distribution.request_headers import get_authenticated_user
-from llama_stack.distribution.store import DistributionRegistry
+from llama_stack.core.request_headers import get_authenticated_user
+from llama_stack.core.store import DistributionRegistry
 from llama_stack.log import get_logger
 from llama_stack.providers.datatypes import Api, RoutingTable

@ -57,6 +60,8 @@ async def unregister_object_from_provider(obj: RoutableObject, p: Any) -> None:
        return await p.unregister_vector_db(obj.identifier)
    elif api == Api.inference:
        return await p.unregister_model(obj.identifier)
+    elif api == Api.safety:
+        return await p.unregister_shield(obj.identifier)
    elif api == Api.datasetio:
        return await p.unregister_dataset(obj.identifier)
    elif api == Api.tool_runtime:
@ -115,7 +120,10 @@ class CommonRoutingTableImpl(RoutingTable):
        for p in self.impls_by_provider_id.values():
            await p.shutdown()

-    def get_provider_impl(self, routing_key: str, provider_id: str | None = None) -> Any:
+    async def refresh(self) -> None:
+        """Do nothing; this implementation of the refresh hook has an empty body."""
+        # NOTE(review): presumably overridden by routing tables that need to re-sync
+        # with their providers — confirm against the subclasses.
+        pass
+
+    async def get_provider_impl(self, routing_key: str, provider_id: str | None = None) -> Any:
        from .benchmarks import BenchmarksRoutingTable
        from .datasets import DatasetsRoutingTable
        from .models import ModelsRoutingTable
@ -204,11 +212,24 @@ class CommonRoutingTableImpl(RoutingTable):
        if obj.type == ResourceType.model.value:
            await self.dist_registry.register(registered_obj)
            return registered_obj
-
        else:
            await self.dist_registry.register(obj)
            return obj

+    async def assert_action_allowed(
+        self,
+        action: Action,
+        type: str,
+        identifier: str,
+    ) -> None:
+        """Ensure the authenticated user may perform ``action`` on a registered object.
+
+        :param action: the action to check against ``self.policy``
+        :param type: resource type used for the lookup
+        :param identifier: identifier of the object within that type
+        :raises ValueError: if no object with this type/identifier is found
+        :raises AccessDeniedError: if the policy does not allow the action for the user
+        """
+        target = await self.get_object_by_identifier(type, identifier)
+        if target is None:
+            raise ValueError(f"{type.capitalize()} '{identifier}' not found")
+
+        current_user = get_authenticated_user()
+        if is_action_allowed(self.policy, action, target, current_user):
+            return
+        raise AccessDeniedError(action, target, current_user)
+
    async def get_all_with_type(self, type: str) -> list[RoutableObjectWithProvider]:
        objs = await self.dist_registry.get_all()
        filtered_objs = [obj for obj in objs if obj.type == type]
@ -220,3 +241,28 @@ class CommonRoutingTableImpl(RoutingTable):
            ]

        return filtered_objs
+
+
+async def lookup_model(routing_table: CommonRoutingTableImpl, model_id: str) -> Model:
+    """Resolve ``model_id`` to a registered model.
+
+    The routing table is first queried by identifier. When that lookup returns nothing, every
+    registered model is scanned for a matching ``provider_resource_id`` as a
+    backwards-compatibility fallback.
+
+    :param routing_table: Routing table used for both lookups
+    :param model_id: Model identifier, alias, or unscoped provider model id
+    :returns: The single matching model
+    :raises ModelNotFoundError: If neither lookup finds a model
+    :raises ValueError: If the fallback scan matches more than one model
+    """
+    # an alias or a `provider_id/provider_model_id` name resolves directly
+    direct_match = await routing_table.get_object_by_identifier("model", model_id)
+    if direct_match is not None:
+        return direct_match
+
+    logger.warning(
+        f"WARNING: model identifier '{model_id}' not found in routing table. Falling back to "
+        "searching in all providers. This is only for backwards compatibility and will stop working "
+        "soon. Migrate your calls to use fully scoped `provider_id/model_id` names."
+    )
+    # model_id must then be an unscoped provider_model_id; the KVStore has no efficient index
+    # for that, so every registered model is scanned
+    candidates = [
+        candidate
+        for candidate in await routing_table.get_all_with_type("model")
+        if candidate.provider_resource_id == model_id
+    ]
+    if not candidates:
+        raise ModelNotFoundError(model_id)
+    if len(candidates) > 1:
+        raise ValueError(f"Multiple providers found for '{model_id}': {[m.provider_id for m in candidates]}")
+    return candidates[0]
--- a/llama_stack/distribution/routing_tables/datasets.py
+++ b/llama_stack/distribution/routing_tables/datasets.py
@ -7,6 +7,7 @@
 import uuid
 from typing import Any

+from llama_stack.apis.common.errors import DatasetNotFoundError
 from llama_stack.apis.datasets import (
    Dataset,
    DatasetPurpose,
@ -18,7 +19,7 @@ from llama_stack.apis.datasets import (
    URIDataSource,
 )
 from llama_stack.apis.resource import ResourceType
-from llama_stack.distribution.datatypes import (
+from llama_stack.core.datatypes import (
    DatasetWithOwner,
 )
 from llama_stack.log import get_logger
@ -35,7 +36,7 @@ class DatasetsRoutingTable(CommonRoutingTableImpl, Datasets):
    async def get_dataset(self, dataset_id: str) -> Dataset:
        dataset = await self.get_object_by_identifier("dataset", dataset_id)
        if dataset is None:
-            raise ValueError(f"Dataset '{dataset_id}' not found")
+            raise DatasetNotFoundError(dataset_id)
        return dataset

    async def register_dataset(
@ -87,6 +88,4 @@ class DatasetsRoutingTable(CommonRoutingTableImpl, Datasets):

    async def unregister_dataset(self, dataset_id: str) -> None:
        dataset = await self.get_dataset(dataset_id)
-        if dataset is None:
-            raise ValueError(f"Dataset {dataset_id} not found")
        await self.unregister_object(dataset)
--- a/llama_stack/core/routing_tables/models.py
+++ b/llama_stack/core/routing_tables/models.py
@ -0,0 +1,156 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+import time
+from typing import Any
+
+from llama_stack.apis.common.errors import ModelNotFoundError
+from llama_stack.apis.models import ListModelsResponse, Model, Models, ModelType, OpenAIListModelsResponse, OpenAIModel
+from llama_stack.core.datatypes import (
+    ModelWithOwner,
+    RegistryEntrySource,
+)
+from llama_stack.log import get_logger
+
+from .common import CommonRoutingTableImpl, lookup_model
+
+logger = get_logger(name=__name__, category="core")
+
+
+class ModelsRoutingTable(CommonRoutingTableImpl, Models):
+    """Routing table for models: lists, registers and unregisters them and resolves their provider."""
+
+    # Ids of providers whose models have been listed at least once. The class-level set is only
+    # the empty default: refresh() rebinds a per-instance set instead of mutating this one.
+    listed_providers: set[str] = set()
+
+    async def refresh(self) -> None:
+        """Re-list models from each provider that asks for a refresh or has not been listed yet.
+
+        A provider whose ``list_models()`` raises is logged and skipped; it stays unlisted and is
+        retried on the next call.
+        """
+        for provider_id, provider in self.impls_by_provider_id.items():
+            refresh = await provider.should_refresh_models()
+            refresh = refresh or provider_id not in self.listed_providers
+            if not refresh:
+                continue
+
+            try:
+                models = await provider.list_models()
+            except Exception as e:
+                logger.exception(f"Model refresh failed for provider {provider_id}: {e}")
+                continue
+
+            # rebind rather than .add(): mutating the class-level default in place would leak the
+            # "already listed" state into every other ModelsRoutingTable instance
+            self.listed_providers = self.listed_providers | {provider_id}
+            if models is None:
+                continue
+
+            await self.update_registered_models(provider_id, models)
+
+    async def list_models(self) -> ListModelsResponse:
+        """Return every registered object of type ``"model"``."""
+        return ListModelsResponse(data=await self.get_all_with_type("model"))
+
+    async def openai_list_models(self) -> OpenAIListModelsResponse:
+        """Return the registered models as OpenAI-style model entries."""
+        models = await self.get_all_with_type("model")
+        # read the clock once so every entry of one response carries the same timestamp
+        created = int(time.time())
+        openai_models = [
+            OpenAIModel(
+                id=model.identifier,
+                object="model",
+                created=created,
+                owned_by="llama_stack",
+            )
+            for model in models
+        ]
+        return OpenAIListModelsResponse(data=openai_models)
+
+    async def get_model(self, model_id: str) -> Model:
+        """Return the model resolved by ``lookup_model`` for ``model_id``."""
+        return await lookup_model(self, model_id)
+
+    async def get_provider_impl(self, model_id: str) -> Any:
+        """Return the provider implementation registered for the model's ``provider_id``."""
+        model = await lookup_model(self, model_id)
+        return self.impls_by_provider_id[model.provider_id]
+
+    async def register_model(
+        self,
+        model_id: str,
+        provider_model_id: str | None = None,
+        provider_id: str | None = None,
+        metadata: dict[str, Any] | None = None,
+        model_type: ModelType | None = None,
+    ) -> Model:
+        """Register a model through the register API.
+
+        :param model_id: Requested identifier; treated as an alias when it differs from
+            ``provider_model_id``
+        :param provider_model_id: (Optional) Provider-side model id, defaults to ``model_id``
+        :param provider_id: (Optional) Provider to register with; may be omitted only when
+            exactly one provider is configured
+        :param metadata: (Optional) Model metadata, defaults to an empty dict
+        :param model_type: (Optional) Model type, defaults to ``ModelType.llm``
+        :returns: The object returned by ``register_object``
+        :raises ValueError: If ``provider_id`` is omitted without exactly one provider
+            configured, or an embedding model lacks ``embedding_dimension`` metadata
+        """
+        if provider_id is None:
+            # If provider_id is not specified, fall back to the only configured provider
+            if len(self.impls_by_provider_id) == 1:
+                provider_id = list(self.impls_by_provider_id.keys())[0]
+            else:
+                # list(...) so the message shows plain ids rather than a dict_keys(...) repr
+                raise ValueError(
+                    f"Please specify a provider_id for model {model_id} since multiple providers are available: {list(self.impls_by_provider_id)}.\n\n"
+                    "Use the provider_id as a prefix to disambiguate, e.g. 'provider_id/model_id'."
+                )
+
+        provider_model_id = provider_model_id or model_id
+        metadata = metadata or {}
+        model_type = model_type or ModelType.llm
+        if "embedding_dimension" not in metadata and model_type == ModelType.embedding:
+            raise ValueError("Embedding model must have an embedding dimension in its metadata")
+
+        # an identifier different than provider_model_id implies it is an alias, so that
+        # becomes the globally unique identifier. otherwise provider_model_ids can conflict,
+        # so as a general rule we must use the provider_id to disambiguate.
+
+        if model_id != provider_model_id:
+            identifier = model_id
+        else:
+            identifier = f"{provider_id}/{provider_model_id}"
+
+        model = ModelWithOwner(
+            identifier=identifier,
+            provider_resource_id=provider_model_id,
+            provider_id=provider_id,
+            metadata=metadata,
+            model_type=model_type,
+            source=RegistryEntrySource.via_register_api,
+        )
+        registered_model = await self.register_object(model)
+        return registered_model
+
+    async def unregister_model(self, model_id: str) -> None:
+        """Resolve ``model_id`` and unregister the resulting model.
+
+        :raises ModelNotFoundError: If the lookup yields no model
+        """
+        existing_model = await self.get_model(model_id)
+        # defensive guard, kept in case the lookup ever returns None instead of raising
+        if existing_model is None:
+            raise ModelNotFoundError(model_id)
+        await self.unregister_object(existing_model)
+
+    async def update_registered_models(
+        self,
+        provider_id: str,
+        models: list[Model],
+    ) -> None:
+        """Replace the provider-listed models of ``provider_id`` with ``models``.
+
+        Entries registered through the register API are kept, and a listed model with the same
+        ``provider_resource_id`` is skipped so it cannot overwrite them. The objects in
+        ``models`` are not modified.
+
+        :param provider_id: Provider whose registry entries are being refreshed
+        :param models: Models reported by that provider
+        """
+        existing_models = await self.get_all_with_type("model")
+
+        # we may have an alias for the model registered by the user (or during initialization
+        # from run.yaml) that we need to keep track of
+        user_registered_ids: set[str] = set()
+        for model in existing_models:
+            if model.provider_id != provider_id:
+                continue
+            if model.source == RegistryEntrySource.via_register_api:
+                user_registered_ids.add(model.provider_resource_id)
+                continue
+
+            logger.debug(f"unregistering model {model.identifier}")
+            await self.unregister_object(model)
+
+        for model in models:
+            if model.provider_resource_id in user_registered_ids:
+                # avoid overwriting a non-provider-registered model entry
+                continue
+
+            # scope a bare provider id locally instead of rewriting the caller's model object
+            identifier = model.identifier
+            if identifier == model.provider_resource_id:
+                identifier = f"{provider_id}/{model.provider_resource_id}"
+
+            logger.debug(f"registering model {identifier} ({model.provider_resource_id})")
+            await self.register_object(
+                ModelWithOwner(
+                    identifier=identifier,
+                    provider_resource_id=model.provider_resource_id,
+                    provider_id=provider_id,
+                    metadata=model.metadata,
+                    model_type=model.model_type,
+                    source=RegistryEntrySource.listed_from_provider,
+                )
+            )
--- a/llama_stack/distribution/routing_tables/scoring_functions.py
+++ b/llama_stack/distribution/routing_tables/scoring_functions.py
@ -12,7 +12,7 @@ from llama_stack.apis.scoring_functions import (
    ScoringFnParams,
    ScoringFunctions,
 )
-from llama_stack.distribution.datatypes import (
+from llama_stack.core.datatypes import (
    ScoringFnWithOwner,
 )
 from llama_stack.log import get_logger
--- a/llama_stack/distribution/routing_tables/shields.py
+++ b/llama_stack/distribution/routing_tables/shields.py
@ -8,7 +8,7 @@ from typing import Any

 from llama_stack.apis.resource import ResourceType
 from llama_stack.apis.shields import ListShieldsResponse, Shield, Shields
-from llama_stack.distribution.datatypes import (
+from llama_stack.core.datatypes import (
    ShieldWithOwner,
 )
 from llama_stack.log import get_logger
@ -55,3 +55,7 @@ class ShieldsRoutingTable(CommonRoutingTableImpl, Shields):
        )
        await self.register_object(shield)
        return shield
+
+    async def unregister_shield(self, identifier: str) -> None:
+        """Fetch the shield named ``identifier`` and hand it to ``unregister_object``."""
+        shield = await self.get_shield(identifier)
+        await self.unregister_object(shield)
--- a/llama_stack/distribution/routing_tables/toolgroups.py
+++ b/llama_stack/distribution/routing_tables/toolgroups.py
@ -7,8 +7,9 @@
 from typing import Any

 from llama_stack.apis.common.content_types import URL
+from llama_stack.apis.common.errors import ToolGroupNotFoundError
 from llama_stack.apis.tools import ListToolGroupsResponse, ListToolsResponse, Tool, ToolGroup, ToolGroups
-from llama_stack.distribution.datatypes import ToolGroupWithOwner
+from llama_stack.core.datatypes import ToolGroupWithOwner
 from llama_stack.log import get_logger

 from .common import CommonRoutingTableImpl
@ -30,7 +31,7 @@ class ToolGroupsRoutingTable(CommonRoutingTableImpl, ToolGroups):
    tool_to_toolgroup: dict[str, str] = {}

    # overridden
-    def get_provider_impl(self, routing_key: str, provider_id: str | None = None) -> Any:
+    async def get_provider_impl(self, routing_key: str, provider_id: str | None = None) -> Any:
        # we don't index tools in the registry anymore, but only keep a cache of them by toolgroup_id
        # TODO: we may want to invalidate the cache (for a given toolgroup_id) every once in a while?

@ -40,7 +41,7 @@ class ToolGroupsRoutingTable(CommonRoutingTableImpl, ToolGroups):

        if routing_key in self.tool_to_toolgroup:
            routing_key = self.tool_to_toolgroup[routing_key]
-        return super().get_provider_impl(routing_key, provider_id)
+        return await super().get_provider_impl(routing_key, provider_id)

    async def list_tools(self, toolgroup_id: str | None = None) -> ListToolsResponse:
        if toolgroup_id:
@ -59,7 +60,7 @@ class ToolGroupsRoutingTable(CommonRoutingTableImpl, ToolGroups):
        return ListToolsResponse(data=all_tools)

    async def _index_tools(self, toolgroup: ToolGroup):
-        provider_impl = super().get_provider_impl(toolgroup.identifier, toolgroup.provider_id)
+        provider_impl = await super().get_provider_impl(toolgroup.identifier, toolgroup.provider_id)
        tooldefs_response = await provider_impl.list_runtime_tools(toolgroup.identifier, toolgroup.mcp_endpoint)

        # TODO: kill this Tool vs ToolDef distinction
@ -87,7 +88,7 @@ class ToolGroupsRoutingTable(CommonRoutingTableImpl, ToolGroups):
    async def get_tool_group(self, toolgroup_id: str) -> ToolGroup:
        tool_group = await self.get_object_by_identifier("tool_group", toolgroup_id)
        if tool_group is None:
-            raise ValueError(f"Tool group '{toolgroup_id}' not found")
+            raise ToolGroupNotFoundError(toolgroup_id)
        return tool_group

    async def get_tool(self, tool_name: str) -> Tool:
@ -125,7 +126,7 @@ class ToolGroupsRoutingTable(CommonRoutingTableImpl, ToolGroups):
    async def unregister_toolgroup(self, toolgroup_id: str) -> None:
        tool_group = await self.get_tool_group(toolgroup_id)
        if tool_group is None:
-            raise ValueError(f"Tool group {toolgroup_id} not found")
+            raise ToolGroupNotFoundError(toolgroup_id)
        await self.unregister_object(tool_group)

    async def shutdown(self) -> None:
--- a/llama_stack/core/routing_tables/vector_dbs.py
+++ b/llama_stack/core/routing_tables/vector_dbs.py
@ -0,0 +1,229 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from typing import Any
+
+from pydantic import TypeAdapter
+
+from llama_stack.apis.common.errors import ModelNotFoundError, VectorStoreNotFoundError
+from llama_stack.apis.models import ModelType
+from llama_stack.apis.resource import ResourceType
+from llama_stack.apis.vector_dbs import ListVectorDBsResponse, VectorDB, VectorDBs
+from llama_stack.apis.vector_io.vector_io import (
+    SearchRankingOptions,
+    VectorStoreChunkingStrategy,
+    VectorStoreDeleteResponse,
+    VectorStoreFileContentsResponse,
+    VectorStoreFileDeleteResponse,
+    VectorStoreFileObject,
+    VectorStoreFileStatus,
+    VectorStoreObject,
+    VectorStoreSearchResponsePage,
+)
+from llama_stack.core.datatypes import (
+    VectorDBWithOwner,
+)
+from llama_stack.log import get_logger
+
+from .common import CommonRoutingTableImpl, lookup_model
+
+logger = get_logger(name=__name__, category="core")
+
+
+class VectorDBsRoutingTable(CommonRoutingTableImpl, VectorDBs):
+    """Routing table for vector DBs, including the access-checked OpenAI-style vector store calls."""
+
+    async def list_vector_dbs(self) -> ListVectorDBsResponse:
+        """Return every registered object of type ``"vector_db"``."""
+        registered = await self.get_all_with_type("vector_db")
+        return ListVectorDBsResponse(data=registered)
+
+    async def get_vector_db(self, vector_db_id: str) -> VectorDB:
+        """Return the vector DB registered under ``vector_db_id``.
+
+        :raises VectorStoreNotFoundError: If no such vector DB is registered
+        """
+        found = await self.get_object_by_identifier("vector_db", vector_db_id)
+        if found is None:
+            raise VectorStoreNotFoundError(vector_db_id)
+        return found
+
+    async def register_vector_db(
+        self,
+        vector_db_id: str,
+        embedding_model: str,
+        embedding_dimension: int | None = 384,
+        provider_id: str | None = None,
+        provider_vector_db_id: str | None = None,
+        vector_db_name: str | None = None,
+    ) -> VectorDB:
+        """Validate the embedding model and register a new vector DB.
+
+        :param vector_db_id: Identifier for the new vector DB
+        :param embedding_model: Model id resolved via ``lookup_model``; must be an embedding model
+        :param embedding_dimension: Not read by this method; the registered dimension is taken
+            from the model's metadata
+        :param provider_id: (Optional) Provider to use; the first configured provider otherwise
+        :param provider_vector_db_id: (Optional) Provider-side id, defaults to ``vector_db_id``
+        :param vector_db_name: (Optional) Name stored on the vector DB
+        :returns: The validated vector DB object that was registered
+        :raises ValueError: If no provider is configured, or the model is not an embedding
+            model or lacks ``embedding_dimension`` metadata
+        :raises ModelNotFoundError: If the embedding model cannot be resolved
+        """
+        if provider_id is None:
+            available = list(self.impls_by_provider_id.keys())
+            if not available:
+                raise ValueError("No provider available. Please configure a vector_io provider.")
+            provider_id = available[0]
+            if len(available) > 1:
+                logger.warning(
+                    f"No provider specified and multiple providers available. Arbitrarily selected the first provider {provider_id}."
+                )
+
+        model = await lookup_model(self, embedding_model)
+        if model is None:
+            raise ModelNotFoundError(embedding_model)
+        if model.model_type != ModelType.embedding:
+            raise ValueError(f"Model {embedding_model} is not an embedding model")
+        if "embedding_dimension" not in model.metadata:
+            raise ValueError(f"Model {embedding_model} does not have an embedding dimension")
+
+        vector_db = TypeAdapter(VectorDBWithOwner).validate_python(
+            {
+                "identifier": vector_db_id,
+                "type": ResourceType.vector_db.value,
+                "provider_id": provider_id,
+                "provider_resource_id": provider_vector_db_id or vector_db_id,
+                "embedding_model": embedding_model,
+                "embedding_dimension": model.metadata["embedding_dimension"],
+                "vector_db_name": vector_db_name,
+            }
+        )
+        await self.register_object(vector_db)
+        return vector_db
+
+    async def unregister_vector_db(self, vector_db_id: str) -> None:
+        """Fetch the vector DB named ``vector_db_id`` and hand it to ``unregister_object``."""
+        target = await self.get_vector_db(vector_db_id)
+        await self.unregister_object(target)
+
+    async def openai_retrieve_vector_store(
+        self,
+        vector_store_id: str,
+    ) -> VectorStoreObject:
+        """Check ``read`` access, then forward the retrieval to the store's provider."""
+        await self.assert_action_allowed("read", "vector_db", vector_store_id)
+        impl = await self.get_provider_impl(vector_store_id)
+        return await impl.openai_retrieve_vector_store(vector_store_id)
+
+    async def openai_update_vector_store(
+        self,
+        vector_store_id: str,
+        name: str | None = None,
+        expires_after: dict[str, Any] | None = None,
+        metadata: dict[str, Any] | None = None,
+    ) -> VectorStoreObject:
+        """Check ``update`` access, then forward the update to the store's provider."""
+        await self.assert_action_allowed("update", "vector_db", vector_store_id)
+        impl = await self.get_provider_impl(vector_store_id)
+        return await impl.openai_update_vector_store(
+            vector_store_id=vector_store_id,
+            name=name,
+            expires_after=expires_after,
+            metadata=metadata,
+        )
+
+    async def openai_delete_vector_store(
+        self,
+        vector_store_id: str,
+    ) -> VectorStoreDeleteResponse:
+        """Check ``delete`` access, delete the store at its provider, then unregister it here."""
+        await self.assert_action_allowed("delete", "vector_db", vector_store_id)
+        impl = await self.get_provider_impl(vector_store_id)
+        deletion = await impl.openai_delete_vector_store(vector_store_id)
+        # the provider-side delete runs first; the registry entry is dropped only afterwards
+        await self.unregister_vector_db(vector_store_id)
+        return deletion
+
+    async def openai_search_vector_store(
+        self,
+        vector_store_id: str,
+        query: str | list[str],
+        filters: dict[str, Any] | None = None,
+        max_num_results: int | None = 10,
+        ranking_options: SearchRankingOptions | None = None,
+        rewrite_query: bool | None = False,
+        search_mode: str | None = "vector",
+    ) -> VectorStoreSearchResponsePage:
+        """Check ``read`` access, then forward the search to the store's provider."""
+        await self.assert_action_allowed("read", "vector_db", vector_store_id)
+        impl = await self.get_provider_impl(vector_store_id)
+        return await impl.openai_search_vector_store(
+            vector_store_id=vector_store_id,
+            query=query,
+            filters=filters,
+            max_num_results=max_num_results,
+            ranking_options=ranking_options,
+            rewrite_query=rewrite_query,
+            search_mode=search_mode,
+        )
+
+    async def openai_attach_file_to_vector_store(
+        self,
+        vector_store_id: str,
+        file_id: str,
+        attributes: dict[str, Any] | None = None,
+        chunking_strategy: VectorStoreChunkingStrategy | None = None,
+    ) -> VectorStoreFileObject:
+        """Check ``update`` access, then forward the file attachment to the store's provider."""
+        await self.assert_action_allowed("update", "vector_db", vector_store_id)
+        impl = await self.get_provider_impl(vector_store_id)
+        return await impl.openai_attach_file_to_vector_store(
+            vector_store_id=vector_store_id,
+            file_id=file_id,
+            attributes=attributes,
+            chunking_strategy=chunking_strategy,
+        )
+
+    async def openai_list_files_in_vector_store(
+        self,
+        vector_store_id: str,
+        limit: int | None = 20,
+        order: str | None = "desc",
+        after: str | None = None,
+        before: str | None = None,
+        filter: VectorStoreFileStatus | None = None,
+    ) -> list[VectorStoreFileObject]:
+        """Check ``read`` access, then forward the file listing to the store's provider."""
+        await self.assert_action_allowed("read", "vector_db", vector_store_id)
+        impl = await self.get_provider_impl(vector_store_id)
+        return await impl.openai_list_files_in_vector_store(
+            vector_store_id=vector_store_id,
+            limit=limit,
+            order=order,
+            after=after,
+            before=before,
+            filter=filter,
+        )
+
+    async def openai_retrieve_vector_store_file(
+        self,
+        vector_store_id: str,
+        file_id: str,
+    ) -> VectorStoreFileObject:
+        """Check ``read`` access, then forward the file retrieval to the store's provider."""
+        await self.assert_action_allowed("read", "vector_db", vector_store_id)
+        impl = await self.get_provider_impl(vector_store_id)
+        return await impl.openai_retrieve_vector_store_file(
+            vector_store_id=vector_store_id,
+            file_id=file_id,
+        )
+
+    async def openai_retrieve_vector_store_file_contents(
+        self,
+        vector_store_id: str,
+        file_id: str,
+    ) -> VectorStoreFileContentsResponse:
+        """Check ``read`` access, then forward the file-contents request to the store's provider."""
+        await self.assert_action_allowed("read", "vector_db", vector_store_id)
+        impl = await self.get_provider_impl(vector_store_id)
+        return await impl.openai_retrieve_vector_store_file_contents(
+            vector_store_id=vector_store_id,
+            file_id=file_id,
+        )
+
+    async def openai_update_vector_store_file(
+        self,
+        vector_store_id: str,
+        file_id: str,
+        attributes: dict[str, Any],
+    ) -> VectorStoreFileObject:
+        """Check ``update`` access, then forward the file update to the store's provider."""
+        await self.assert_action_allowed("update", "vector_db", vector_store_id)
+        impl = await self.get_provider_impl(vector_store_id)
+        return await impl.openai_update_vector_store_file(
+            vector_store_id=vector_store_id,
+            file_id=file_id,
+            attributes=attributes,
+        )
+
+    async def openai_delete_vector_store_file(
+        self,
+        vector_store_id: str,
+        file_id: str,
+    ) -> VectorStoreFileDeleteResponse:
+        """Check ``delete`` access, then forward the file deletion to the store's provider."""
+        await self.assert_action_allowed("delete", "vector_db", vector_store_id)
+        impl = await self.get_provider_impl(vector_store_id)
+        return await impl.openai_delete_vector_store_file(
+            vector_store_id=vector_store_id,
+            file_id=file_id,
+        )
--- a/llama_stack/distribution/server/init.py
+++ b/llama_stack/distribution/server/init.py
--- a/llama_stack/distribution/server/auth.py
+++ b/llama_stack/distribution/server/auth.py
@ -7,9 +7,12 @@
 import json

 import httpx
+from aiohttp import hdrs

-from llama_stack.distribution.datatypes import AuthenticationConfig
-from llama_stack.distribution.server.auth_providers import create_auth_provider
+from llama_stack.core.datatypes import AuthenticationConfig, User
+from llama_stack.core.request_headers import user_from_scope
+from llama_stack.core.server.auth_providers import create_auth_provider
+from llama_stack.core.server.routes import find_matching_route, initialize_route_impls
 from llama_stack.log import get_logger

 logger = get_logger(name=__name__, category="auth")
@ -78,12 +81,14 @@ class AuthenticationMiddleware:
    access resources that don't have access_attributes defined.
    """

-    def __init__(self, app, auth_config: AuthenticationConfig):
+    def __init__(self, app, auth_config: AuthenticationConfig, impls):
        self.app = app
+        self.impls = impls
        self.auth_provider = create_auth_provider(auth_config)

    async def __call__(self, scope, receive, send):
        if scope["type"] == "http":
+            # First, handle authentication
            headers = dict(scope.get("headers", []))
            auth_header = headers.get(b"authorization", b"").decode()

@ -121,15 +126,50 @@ class AuthenticationMiddleware:
                f"Authentication successful: {validation_result.principal} with {len(validation_result.attributes)} attributes"
            )

+            # Scope-based API access control
+            path = scope.get("path", "")
+            method = scope.get("method", hdrs.METH_GET)
+
+            if not hasattr(self, "route_impls"):
+                self.route_impls = initialize_route_impls(self.impls)
+
+            try:
+                _, _, _, webmethod = find_matching_route(method, path, self.route_impls)
+            except ValueError:
+                # If no matching endpoint is found, pass through to FastAPI
+                return await self.app(scope, receive, send)
+
+            if webmethod.required_scope:
+                user = user_from_scope(scope)
+                if not _has_required_scope(webmethod.required_scope, user):
+                    return await self._send_auth_error(
+                        send,
+                        f"Access denied: user does not have required scope: {webmethod.required_scope}",
+                        status=403,
+                    )
+
        return await self.app(scope, receive, send)

-    async def _send_auth_error(self, send, message):
+    async def _send_auth_error(self, send, message, status=401):
        await send(
            {
                "type": "http.response.start",
-                "status": 401,
+                "status": status,
                "headers": [[b"content-type", b"application/json"]],
            }
        )
-        error_msg = json.dumps({"error": {"message": message}}).encode()
+        error_key = "message" if status == 401 else "detail"
+        error_msg = json.dumps({"error": {error_key: message}}).encode()
        await send({"type": "http.response.body", "body": error_msg})
+
+
+def _has_required_scope(required_scope: str, user: User | None) -> bool:
+    """Tell whether ``user`` holds ``required_scope``.
+
+    A missing user is taken to mean authentication is not enabled and is always allowed. A user
+    without attributes is denied; otherwise the scope must be listed under the ``"scopes"``
+    attribute.
+    """
+    if not user:
+        # no user: assume auth is not enabled
+        return True
+
+    attributes = user.attributes
+    if not attributes:
+        return False
+
+    return required_scope in attributes.get("scopes", [])
--- a/llama_stack/distribution/server/auth_providers.py
+++ b/llama_stack/distribution/server/auth_providers.py
@ -14,7 +14,7 @@ import httpx
 from jose import jwt
 from pydantic import BaseModel, Field

-from llama_stack.distribution.datatypes import (
+from llama_stack.core.datatypes import (
    AuthenticationConfig,
    CustomAuthConfig,
    GitHubTokenAuthConfig,
--- a/llama_stack/distribution/server/quota.py
+++ b/llama_stack/distribution/server/quota.py
--- a/llama_stack/distribution/server/routes.py
+++ b/llama_stack/distribution/server/routes.py
@ -12,17 +12,18 @@ from typing import Any
 from aiohttp import hdrs
 from starlette.routing import Route

+from llama_stack.apis.datatypes import Api, ExternalApiSpec
 from llama_stack.apis.tools import RAGToolRuntime, SpecialToolGroup
 from llama_stack.apis.version import LLAMA_STACK_API_VERSION
-from llama_stack.distribution.resolver import api_protocol_map
-from llama_stack.providers.datatypes import Api
+from llama_stack.core.resolver import api_protocol_map
+from llama_stack.schema_utils import WebMethod

 EndpointFunc = Callable[..., Any]
 PathParams = dict[str, str]
-RouteInfo = tuple[EndpointFunc, str]
+RouteInfo = tuple[EndpointFunc, str, WebMethod]
 PathImpl = dict[str, RouteInfo]
 RouteImpls = dict[str, PathImpl]
-RouteMatch = tuple[EndpointFunc, PathParams, str]
+RouteMatch = tuple[EndpointFunc, PathParams, str, WebMethod]


 def toolgroup_protocol_map():
@ -31,10 +32,12 @@ def toolgroup_protocol_map():
    }


-def get_all_api_routes() -> dict[Api, list[Route]]:
+def get_all_api_routes(
+    external_apis: dict[Api, ExternalApiSpec] | None = None,
+) -> dict[Api, list[tuple[Route, WebMethod]]]:
    apis = {}

-    protocols = api_protocol_map()
+    protocols = api_protocol_map(external_apis)
    toolgroup_protocols = toolgroup_protocol_map()
    for api, protocol in protocols.items():
        routes = []
@ -65,7 +68,7 @@ def get_all_api_routes() -> dict[Api, list[Route]]:
            else:
                http_method = hdrs.METH_POST
            routes.append(
-                Route(path=path, methods=[http_method], name=name, endpoint=None)
+                (Route(path=path, methods=[http_method], name=name, endpoint=None), webmethod)
            )  # setting endpoint to None since don't use a Router object

        apis[api] = routes
@ -73,8 +76,8 @@ def get_all_api_routes() -> dict[Api, list[Route]]:
    return apis


-def initialize_route_impls(impls: dict[Api, Any]) -> RouteImpls:
-    routes = get_all_api_routes()
+def initialize_route_impls(impls, external_apis: dict[Api, ExternalApiSpec] | None = None) -> RouteImpls:
+    api_to_routes = get_all_api_routes(external_apis)
    route_impls: RouteImpls = {}

    def _convert_path_to_regex(path: str) -> str:
@ -88,10 +91,10 @@ def initialize_route_impls(impls: dict[Api, Any]) -> RouteImpls:

        return f"^{pattern}$"

-    for api, api_routes in routes.items():
+    for api, api_routes in api_to_routes.items():
        if api not in impls:
            continue
-        for route in api_routes:
+        for route, webmethod in api_routes:
            impl = impls[api]
            func = getattr(impl, route.name)
            # Get the first (and typically only) method from the set, filtering out HEAD
@ -104,6 +107,7 @@ def initialize_route_impls(impls: dict[Api, Any]) -> RouteImpls:
            route_impls[method][_convert_path_to_regex(route.path)] = (
                func,
                route.path,
+                webmethod,
            )

    return route_impls
@ -118,7 +122,7 @@ def find_matching_route(method: str, path: str, route_impls: RouteImpls) -> Rout
        route_impls: A dictionary of endpoint implementations

    Returns:
-        A tuple of (endpoint_function, path_params, descriptive_name)
+        A tuple of (endpoint_function, path_params, route_path, webmethod_metadata)

    Raises:
        ValueError: If no matching endpoint is found
@ -127,11 +131,11 @@ def find_matching_route(method: str, path: str, route_impls: RouteImpls) -> Rout
    if not impls:
        raise ValueError(f"No endpoint found for {path}")

-    for regex, (func, descriptive_name) in impls.items():
+    for regex, (func, route_path, webmethod) in impls.items():
        match = re.match(regex, path)
        if match:
            # Extract named groups from the regex match
            path_params = match.groupdict()
-            return func, path_params, descriptive_name
+            return func, path_params, route_path, webmethod

    raise ValueError(f"No endpoint found for {path}")
--- a/llama_stack/distribution/server/server.py
+++ b/llama_stack/distribution/server/server.py
@ -32,27 +32,36 @@ from openai import BadRequestError
 from pydantic import BaseModel, ValidationError

 from llama_stack.apis.common.responses import PaginatedResponse
-from llama_stack.distribution.access_control.access_control import AccessDeniedError
-from llama_stack.distribution.datatypes import (
+from llama_stack.cli.utils import add_config_distro_args, get_config_from_args
+from llama_stack.core.access_control.access_control import AccessDeniedError
+from llama_stack.core.datatypes import (
    AuthenticationRequiredError,
    LoggingConfig,
    StackRunConfig,
 )
-from llama_stack.distribution.distribution import builtin_automatically_routed_apis
-from llama_stack.distribution.request_headers import PROVIDER_DATA_VAR, User, request_provider_data_context
-from llama_stack.distribution.resolver import InvalidProviderError
-from llama_stack.distribution.server.routes import (
+from llama_stack.core.distribution import builtin_automatically_routed_apis
+from llama_stack.core.external import ExternalApiSpec, load_external_apis
+from llama_stack.core.request_headers import (
+    PROVIDER_DATA_VAR,
+    request_provider_data_context,
+    user_from_scope,
+)
+from llama_stack.core.resolver import InvalidProviderError
+from llama_stack.core.server.routes import (
    find_matching_route,
    get_all_api_routes,
    initialize_route_impls,
 )
-from llama_stack.distribution.stack import (
+from llama_stack.core.stack import (
+    cast_image_name_to_string,
    construct_stack,
    replace_env_vars,
+    shutdown_stack,
    validate_env_pair,
 )
-from llama_stack.distribution.utils.config import redact_sensitive_fields
-from llama_stack.distribution.utils.context import preserve_contexts_async_generator
+from llama_stack.core.utils.config import redact_sensitive_fields
+from llama_stack.core.utils.config_resolution import Mode, resolve_config_or_distro
+from llama_stack.core.utils.context import preserve_contexts_async_generator
 from llama_stack.log import get_logger
 from llama_stack.providers.datatypes import Api
 from llama_stack.providers.inline.telemetry.meta_reference.config import TelemetryConfig
@ -143,18 +152,7 @@ async def shutdown(app):
    Handled by the lifespan context manager. The shutdown process involves
    shutting down all implementations registered in the application.
    """
-    for impl in app.__llama_stack_impls__.values():
-        impl_name = impl.__class__.__name__
-        logger.info("Shutting down %s", impl_name)
-        try:
-            if hasattr(impl, "shutdown"):
-                await asyncio.wait_for(impl.shutdown(), timeout=5)
-            else:
-                logger.warning("No shutdown method for %s", impl_name)
-        except TimeoutError:
-            logger.exception("Shutdown timeout for %s ", impl_name, exc_info=True)
-        except (Exception, asyncio.CancelledError) as e:
-            logger.exception("Failed to shutdown %s: %s", impl_name, {e})
+    await shutdown_stack(app.__llama_stack_impls__)


@asynccontextmanager
@ -219,9 +217,7 @@ def create_dynamic_typed_route(func: Any, method: str, route: str) -> Callable:
    @functools.wraps(func)
    async def route_handler(request: Request, **kwargs):
        # Get auth attributes from the request scope
-        user_attributes = request.scope.get("user_attributes", {})
-        principal = request.scope.get("principal", "")
-        user = User(principal=principal, attributes=user_attributes)
+        user = user_from_scope(request.scope)

        await log_request_pre_validation(request)

@ -279,9 +275,10 @@ def create_dynamic_typed_route(func: Any, method: str, route: str) -> Callable:


 class TracingMiddleware:
-    def __init__(self, app, impls):
+    def __init__(self, app, impls, external_apis: dict[str, ExternalApiSpec]):
        self.app = app
        self.impls = impls
+        self.external_apis = external_apis
        # FastAPI built-in paths that should bypass custom routing
        self.fastapi_paths = ("/docs", "/redoc", "/openapi.json", "/favicon.ico", "/static")

@ -298,10 +295,12 @@ class TracingMiddleware:
            return await self.app(scope, receive, send)

        if not hasattr(self, "route_impls"):
-            self.route_impls = initialize_route_impls(self.impls)
+            self.route_impls = initialize_route_impls(self.impls, self.external_apis)

        try:
-            _, _, trace_path = find_matching_route(scope.get("method", hdrs.METH_GET), path, self.route_impls)
+            _, _, route_path, webmethod = find_matching_route(
+                scope.get("method", hdrs.METH_GET), path, self.route_impls
+            )
        except ValueError:
            # If no matching endpoint is found, pass through to FastAPI
            logger.debug(f"No matching route found for path: {path}, falling back to FastAPI")
@ -318,6 +317,7 @@ class TracingMiddleware:
        if tracestate:
            trace_attributes["tracestate"] = tracestate

+        trace_path = webmethod.descriptive_name or route_path
        trace_context = await start_trace(trace_path, trace_attributes)

        async def send_with_trace_id(message):
@ -376,20 +376,8 @@ class ClientVersionMiddleware:
 def main(args: argparse.Namespace | None = None):
    """Start the LlamaStack server."""
    parser = argparse.ArgumentParser(description="Start the LlamaStack server.")
-    parser.add_argument(
-        "--yaml-config",
-        dest="config",
-        help="(Deprecated) Path to YAML configuration file - use --config instead",
-    )
-    parser.add_argument(
-        "--config",
-        dest="config",
-        help="Path to YAML configuration file",
-    )
-    parser.add_argument(
-        "--template",
-        help="One of the template names in llama_stack/templates (e.g., tgi, fireworks, remote-vllm, etc.)",
-    )
+
+    add_config_distro_args(parser)
    parser.add_argument(
        "--port",
        type=int,
@ -408,20 +396,8 @@ def main(args: argparse.Namespace | None = None):
    if args is None:
        args = parser.parse_args()

-    log_line = ""
-    if hasattr(args, "config") and args.config:
-        # if the user provided a config file, use it, even if template was specified
-        config_file = Path(args.config)
-        if not config_file.exists():
-            raise ValueError(f"Config file {config_file} does not exist")
-        log_line = f"Using config file: {config_file}"
-    elif hasattr(args, "template") and args.template:
-        config_file = Path(REPO_ROOT) / "llama_stack" / "templates" / args.template / "run.yaml"
-        if not config_file.exists():
-            raise ValueError(f"Template {args.template} does not exist")
-        log_line = f"Using template {args.template} config file: {config_file}"
-    else:
-        raise ValueError("Either --config or --template must be provided")
+    config_or_distro = get_config_from_args(args)
+    config_file = resolve_config_or_distro(config_or_distro, Mode.RUN)

    logger_config = None
    with open(config_file) as fp:
@ -439,14 +415,9 @@ def main(args: argparse.Namespace | None = None):
                    logger.error(f"Error: {str(e)}")
                    sys.exit(1)
        config = replace_env_vars(config_contents)
-        config = StackRunConfig(**config)
+        config = StackRunConfig(**cast_image_name_to_string(config))

-    # now that the logger is initialized, print the line about which type of config we are using.
-    logger.info(log_line)
-
-    logger.info("Run configuration:")
-    safe_config = redact_sensitive_fields(config.model_dump(mode="json"))
-    logger.info(yaml.dump(safe_config, indent=2))
+    _log_run_config(run_config=config)

    app = FastAPI(
        lifespan=lifespan,
@ -454,13 +425,25 @@ def main(args: argparse.Namespace | None = None):
        redoc_url="/redoc",
        openapi_url="/openapi.json",
    )
+
    if not os.environ.get("LLAMA_STACK_DISABLE_VERSION_CHECK"):
        app.add_middleware(ClientVersionMiddleware)

-    # Add authentication middleware if configured
+    try:
+        # Create and set the event loop that will be used for both construction and server runtime
+        loop = asyncio.new_event_loop()
+        asyncio.set_event_loop(loop)
+
+        # Construct the stack in the persistent event loop
+        impls = loop.run_until_complete(construct_stack(config))
+
+    except InvalidProviderError as e:
+        logger.error(f"Error: {str(e)}")
+        sys.exit(1)
+
    if config.server.auth:
        logger.info(f"Enabling authentication with provider: {config.server.auth.provider_config.type.value}")
-        app.add_middleware(AuthenticationMiddleware, auth_config=config.server.auth)
+        app.add_middleware(AuthenticationMiddleware, auth_config=config.server.auth, impls=impls)
    else:
        if config.server.quota:
            quota = config.server.quota
@ -491,18 +474,14 @@ def main(args: argparse.Namespace | None = None):
            window_seconds=window_seconds,
        )

-    try:
-        impls = asyncio.run(construct_stack(config))
-    except InvalidProviderError as e:
-        logger.error(f"Error: {str(e)}")
-        sys.exit(1)
-
    if Api.telemetry in impls:
        setup_logger(impls[Api.telemetry])
    else:
        setup_logger(TelemetryAdapter(TelemetryConfig(), {}))

-    all_routes = get_all_api_routes()
+    # Load external APIs if configured
+    external_apis = load_external_apis(config)
+    all_routes = get_all_api_routes(external_apis)

    if config.apis:
        apis_to_serve = set(config.apis)
@ -521,9 +500,12 @@ def main(args: argparse.Namespace | None = None):
        api = Api(api_str)

        routes = all_routes[api]
-        impl = impls[api]
+        try:
+            impl = impls[api]
+        except KeyError as e:
+            raise ValueError(f"Could not find provider implementation for {api} API") from e

-        for route in routes:
+        for route, _ in routes:
            if not hasattr(impl, route.name):
                # ideally this should be a typing violation already
                raise ValueError(f"Could not find method {route.name} on {impl}!")
@ -552,7 +534,7 @@ def main(args: argparse.Namespace | None = None):
    app.exception_handler(Exception)(global_exception_handler)

    app.__llama_stack_impls__ = impls
-    app.add_middleware(TracingMiddleware, impls=impls)
+    app.add_middleware(TracingMiddleware, impls=impls, external_apis=external_apis)

    import uvicorn

@ -586,11 +568,37 @@ def main(args: argparse.Namespace | None = None):
        "port": port,
        "lifespan": "on",
        "log_level": logger.getEffectiveLevel(),
+        "log_config": logger_config,
    }
    if ssl_config:
        uvicorn_config.update(ssl_config)

-    uvicorn.run(**uvicorn_config)
+    # Run uvicorn in the existing event loop to preserve background tasks
+    # We need to catch KeyboardInterrupt because uvicorn's signal handling
+    # re-raises SIGINT signals using signal.raise_signal(), which Python
+    # converts to KeyboardInterrupt. Without this catch, we'd get a confusing
+    # stack trace when using Ctrl+C or kill -2 (SIGINT).
+    # SIGTERM (kill -15) works fine without this because Python doesn't
+    # have a default handler for it.
+    #
+    # Another approach would be to ignore SIGINT entirely - let uvicorn handle it through its own
+    # signal handling but this is quite intrusive and not worth the effort.
+    try:
+        loop.run_until_complete(uvicorn.Server(uvicorn.Config(**uvicorn_config)).serve())
+    except (KeyboardInterrupt, SystemExit):
+        logger.info("Received interrupt signal, shutting down gracefully...")
+    finally:
+        if not loop.is_closed():
+            logger.debug("Closing event loop")
+            loop.close()
+
+
+def _log_run_config(run_config: StackRunConfig):
+    """Log the run configuration as YAML at INFO level.
+
+    The config is dumped in JSON mode, passed through ``redact_sensitive_fields``
+    and then through ``remove_disabled_providers`` before it is emitted.
+    """
+    logger.info("Run configuration:")
+    dumped = run_config.model_dump(mode="json")
+    printable = remove_disabled_providers(redact_sensitive_fields(dumped))
+    logger.info(yaml.dump(printable, indent=2))


 def extract_path_params(route: str) -> list[str]:
@ -601,5 +609,17 @@ def extract_path_params(route: str) -> list[str]:
    return params


+def remove_disabled_providers(obj):
+    """Recursively strip disabled entries from a nested config structure.
+
+    A dict counts as disabled when any of its ``provider_id``, ``shield_id``,
+    ``provider_model_id`` or ``model_id`` keys is present with the value
+    ``"__disabled__"``, ``""`` or ``None``; such a dict is replaced by ``None``.
+    Entries that are ``None`` after recursion are dropped from their parent
+    dict or list. Anything that is neither a dict nor a list is returned as is.
+    """
+    if isinstance(obj, list):
+        kept_items = []
+        for element in obj:
+            cleaned = remove_disabled_providers(element)
+            if cleaned is not None:
+                kept_items.append(cleaned)
+        return kept_items
+
+    if not isinstance(obj, dict):
+        return obj
+
+    disabled_markers = ("__disabled__", "", None)
+    for id_key in ("provider_id", "shield_id", "provider_model_id", "model_id"):
+        if id_key in obj and obj[id_key] in disabled_markers:
+            return None
+
+    kept_entries = {}
+    for name, value in obj.items():
+        cleaned = remove_disabled_providers(value)
+        if cleaned is not None:
+            kept_entries[name] = cleaned
+    return kept_entries
+
+
 if __name__ == "__main__":
    main()
--- a/llama_stack/distribution/stack.py
+++ b/llama_stack/distribution/stack.py
@ -4,6 +4,7 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.

+import asyncio
 import importlib.resources
 import os
 import re
@ -33,13 +34,14 @@ from llama_stack.apis.telemetry import Telemetry
 from llama_stack.apis.tools import RAGToolRuntime, ToolGroups, ToolRuntime
 from llama_stack.apis.vector_dbs import VectorDBs
 from llama_stack.apis.vector_io import VectorIO
-from llama_stack.distribution.datatypes import Provider, StackRunConfig
-from llama_stack.distribution.distribution import get_provider_registry
-from llama_stack.distribution.inspect import DistributionInspectConfig, DistributionInspectImpl
-from llama_stack.distribution.providers import ProviderImpl, ProviderImplConfig
-from llama_stack.distribution.resolver import ProviderRegistry, resolve_impls
-from llama_stack.distribution.store.registry import create_dist_registry
-from llama_stack.distribution.utils.dynamic import instantiate_class_type
+from llama_stack.core.datatypes import Provider, StackRunConfig
+from llama_stack.core.distribution import get_provider_registry
+from llama_stack.core.inspect import DistributionInspectConfig, DistributionInspectImpl
+from llama_stack.core.providers import ProviderImpl, ProviderImplConfig
+from llama_stack.core.resolver import ProviderRegistry, resolve_impls
+from llama_stack.core.routing_tables.common import CommonRoutingTableImpl
+from llama_stack.core.store.registry import create_dist_registry
+from llama_stack.core.utils.dynamic import instantiate_class_type
 from llama_stack.log import get_logger
 from llama_stack.providers.datatypes import Api

@ -90,6 +92,11 @@ RESOURCES = [
 ]


+REGISTRY_REFRESH_INTERVAL_SECONDS = 300
+REGISTRY_REFRESH_TASK = None
+TEST_RECORDING_CONTEXT = None
+
+
 async def register_resources(run_config: StackRunConfig, impls: dict[Api, Any]):
    for rsrc, api, register_method, list_method in RESOURCES:
        objects = getattr(run_config, rsrc)
@ -99,23 +106,10 @@ async def register_resources(run_config: StackRunConfig, impls: dict[Api, Any]):
        method = getattr(impls[api], register_method)
        for obj in objects:
            logger.debug(f"registering {rsrc.capitalize()} {obj} for provider {obj.provider_id}")
-            # Do not register models on disabled providers
-            if hasattr(obj, "provider_id") and obj.provider_id is not None and obj.provider_id == "__disabled__":
-                logger.debug(f"Skipping {rsrc.capitalize()} registration for disabled provider.")
-                continue
-            # In complex templates, like our starter template, we may have dynamic model ids
-            # given by environment variables. This allows those environment variables to have
-            # a default value of __disabled__ to skip registration of the model if not set.
-            if (
-                hasattr(obj, "provider_model_id")
-                and obj.provider_model_id is not None
-                and "__disabled__" in obj.provider_model_id
-            ):
-                logger.debug(f"Skipping {rsrc.capitalize()} registration for disabled model.")
-                continue

-            if hasattr(obj, "shield_id") and obj.shield_id is not None and obj.shield_id == "__disabled__":
-                logger.debug(f"Skipping {rsrc.capitalize()} registration for disabled shield.")
+            # Do not register models on disabled providers
+            if hasattr(obj, "provider_id") and (not obj.provider_id or obj.provider_id == "__disabled__"):
+                logger.debug(f"Skipping {rsrc.capitalize()} registration for disabled provider.")
                continue

            # we want to maintain the type information in arguments to method.
@ -172,7 +166,6 @@ def replace_env_vars(config: Any, path: str = "") -> Any:
                            # Create a copy with resolved provider_id but original config
                            disabled_provider = v.copy()
                            disabled_provider["provider_id"] = resolved_provider_id
-                            result.append(disabled_provider)
                            continue
                    except EnvVarError:
                        # If we can't resolve the provider_id, continue with normal processing
@ -267,6 +260,13 @@ def _convert_string_to_proper_type(value: str) -> Any:
    return value


+def cast_image_name_to_string(config_dict: dict[str, Any]) -> dict[str, Any]:
+    """Coerce a non-None ``image_name`` entry of ``config_dict`` to ``str``.
+
+    The dict is modified in place and the same object is returned; a missing
+    or ``None`` ``image_name`` is left untouched.
+    """
+    image_name = config_dict.get("image_name")
+    if image_name is not None:
+        config_dict["image_name"] = str(image_name)
+    return config_dict
+
+
 def validate_env_pair(env_pair: str) -> tuple[str, str]:
    """Validate and split an environment variable key-value pair."""
    try:
@ -308,6 +308,15 @@ def add_internal_implementations(impls: dict[Api, Any], run_config: StackRunConf
 async def construct_stack(
    run_config: StackRunConfig, provider_registry: ProviderRegistry | None = None
 ) -> dict[Api, Any]:
+    if "LLAMA_STACK_TEST_INFERENCE_MODE" in os.environ:
+        from llama_stack.testing.inference_recorder import setup_inference_recording
+
+        global TEST_RECORDING_CONTEXT
+        TEST_RECORDING_CONTEXT = setup_inference_recording()
+        if TEST_RECORDING_CONTEXT:
+            TEST_RECORDING_CONTEXT.__enter__()
+            logger.info(f"Inference recording enabled: mode={os.environ.get('LLAMA_STACK_TEST_INFERENCE_MODE')}")
+
    dist_registry, _ = await create_dist_registry(run_config.metadata_store, run_config.image_name)
    policy = run_config.server.auth.access_policy if run_config.server.auth else []
    impls = await resolve_impls(
@ -318,15 +327,74 @@ async def construct_stack(
    add_internal_implementations(impls, run_config)

    await register_resources(run_config, impls)
+
+    await refresh_registry_once(impls)
+
+    global REGISTRY_REFRESH_TASK
+    REGISTRY_REFRESH_TASK = asyncio.create_task(refresh_registry_task(impls))
+
+    def cb(task):
+        import traceback
+
+        if task.cancelled():
+            logger.error("Model refresh task cancelled")
+        elif task.exception():
+            logger.error(f"Model refresh task failed: {task.exception()}")
+            traceback.print_exception(task.exception())
+        else:
+            logger.debug("Model refresh task completed")
+
+    REGISTRY_REFRESH_TASK.add_done_callback(cb)
    return impls


-def get_stack_run_config_from_template(template: str) -> StackRunConfig:
-    template_path = importlib.resources.files("llama_stack") / f"templates/{template}/run.yaml"
+async def shutdown_stack(impls: dict[Api, Any]):
+    """Shut down every implementation in ``impls`` and release module-level state.
+
+    Each implementation exposing a ``shutdown`` attribute has it awaited with a
+    5 second timeout; one without it only produces a warning. Timeouts and other
+    failures are logged and do not stop the remaining implementations from being
+    processed. Afterwards the inference-recording context (if one was entered)
+    is exited and the registry refresh task (if one was started) is cancelled.
+
+    :param impls: Mapping of API to its implementation object.
+    """
+    for impl in impls.values():
+        impl_name = impl.__class__.__name__
+        logger.info(f"Shutting down {impl_name}")
+        try:
+            if hasattr(impl, "shutdown"):
+                # Bound each shutdown so one hanging implementation cannot block the rest.
+                await asyncio.wait_for(impl.shutdown(), timeout=5)
+            else:
+                logger.warning(f"No shutdown method for {impl_name}")
+        except TimeoutError:
+            logger.exception(f"Shutdown timeout for {impl_name}")
+        except (Exception, asyncio.CancelledError) as e:
+            # CancelledError is caught explicitly alongside Exception so that it
+            # is logged here rather than propagating out of the loop.
+            logger.exception(f"Failed to shutdown {impl_name}: {e}")

-    with importlib.resources.as_file(template_path) as path:
+    global TEST_RECORDING_CONTEXT
+    if TEST_RECORDING_CONTEXT:
+        try:
+            # Exit the recording context manually, passing no exception info.
+            TEST_RECORDING_CONTEXT.__exit__(None, None, None)
+        except Exception as e:
+            logger.error(f"Error during inference recording cleanup: {e}")
+
+    global REGISTRY_REFRESH_TASK
+    if REGISTRY_REFRESH_TASK:
+        # Cancellation is only requested here; the task is not awaited.
+        REGISTRY_REFRESH_TASK.cancel()
+
+
+async def refresh_registry_once(impls: dict[Api, Any]):
+    """Call ``refresh()`` once on every routing-table implementation in ``impls``.
+
+    Only values that are instances of ``CommonRoutingTableImpl`` are refreshed,
+    one after another in the order they appear in ``impls``.
+    """
+    logger.debug("refreshing registry")
+    # Snapshot the matching implementations before awaiting anything.
+    tables = list(filter(lambda impl: isinstance(impl, CommonRoutingTableImpl), impls.values()))
+    for table in tables:
+        await table.refresh()
+
+
+async def refresh_registry_task(impls: dict[Api, Any]):
+    """Refresh the registry in an endless loop.
+
+    Each iteration awaits ``refresh_registry_once(impls)`` and then sleeps for
+    ``REGISTRY_REFRESH_INTERVAL_SECONDS``. The loop has no exit condition of
+    its own.
+    """
+    logger.info("starting registry refresh task")
+    while True:
+        await refresh_registry_once(impls)
+
+        await asyncio.sleep(REGISTRY_REFRESH_INTERVAL_SECONDS)
+
+
+def get_stack_run_config_from_distro(distro: str) -> StackRunConfig:
+    """Load the ``run.yaml`` bundled with a distribution as a ``StackRunConfig``.
+
+    :param distro: Name of the directory under ``llama_stack/distributions``.
+    :returns: The parsed run config after ``replace_env_vars`` has been applied.
+    :raises ValueError: If the distribution's ``run.yaml`` does not exist.
+    """
+    distro_path = importlib.resources.files("llama_stack") / f"distributions/{distro}/run.yaml"
+
+    with importlib.resources.as_file(distro_path) as path:
        if not path.exists():
-            raise ValueError(f"Template '{template}' not found at {template_path}")
+            raise ValueError(f"Distribution '{distro}' not found at {distro_path}")
-        run_config = yaml.safe_load(path.open())
+        # Open the file in a context manager so the handle is closed
+        # deterministically instead of being left to garbage collection.
+        with path.open() as f:
+            run_config = yaml.safe_load(f)

    return StackRunConfig(**replace_env_vars(run_config))
--- a/llama_stack/distribution/start_stack.sh
+++ b/llama_stack/distribution/start_stack.sh
@ -40,7 +40,6 @@ port="$1"
 shift

 SCRIPT_DIR=$(dirname "$(readlink -f "$0")")
-source "$SCRIPT_DIR/common.sh"

 # Initialize variables
 yaml_config=""
@ -75,9 +74,9 @@ while [[ $# -gt 0 ]]; do
  esac
 done

-# Check if yaml_config is required based on env_type
-if [[ "$env_type" == "venv" || "$env_type" == "conda" ]] && [ -z "$yaml_config" ]; then
-  echo -e "${RED}Error: --config is required for venv and conda environments${NC}" >&2
+# Check if yaml_config is required
+if [[ "$env_type" == "venv" ]] && [ -z "$yaml_config" ]; then
+  echo -e "${RED}Error: --config is required for venv environment${NC}" >&2
  exit 1
 fi

@ -101,28 +100,23 @@ case "$env_type" in
        source "$env_path_or_name/bin/activate"
    fi
    ;;
-  "conda")
-    if ! is_command_available conda; then
-        echo -e "${RED}Error: conda not found" >&2
-        exit 1
-    fi
-    eval "$(conda shell.bash hook)"
-    conda deactivate && conda activate "$env_path_or_name"
-    PYTHON_BINARY="$CONDA_PREFIX/bin/python"
-    ;;
  *)
+    # Handle unsupported env_types here
+    echo -e "${RED}Error: Unsupported environment type '$env_type'. Only 'venv' is supported.${NC}" >&2
+    exit 1
+    ;;
 esac

-if [[ "$env_type" == "venv" || "$env_type" == "conda" ]]; then
+if [[ "$env_type" == "venv" ]]; then
    set -x

    if [ -n "$yaml_config" ]; then
-        yaml_config_arg="--config $yaml_config"
+        yaml_config_arg="$yaml_config"
    else
        yaml_config_arg=""
    fi

-    $PYTHON_BINARY -m llama_stack.distribution.server.server \
+    $PYTHON_BINARY -m llama_stack.core.server.server \
    $yaml_config_arg \
    --port "$port" \
    $env_vars \
--- a/llama_stack/distribution/store/init.py
+++ b/llama_stack/distribution/store/init.py
--- a/llama_stack/distribution/store/registry.py
+++ b/llama_stack/distribution/store/registry.py
@ -10,8 +10,8 @@ from typing import Protocol

 import pydantic

-from llama_stack.distribution.datatypes import RoutableObjectWithProvider
-from llama_stack.distribution.utils.config_dirs import DISTRIBS_BASE_DIR
+from llama_stack.core.datatypes import RoutableObjectWithProvider
+from llama_stack.core.utils.config_dirs import DISTRIBS_BASE_DIR
 from llama_stack.log import get_logger
 from llama_stack.providers.utils.kvstore import KVStore, kvstore_impl
 from llama_stack.providers.utils.kvstore.config import KVStoreConfig, SqliteKVStoreConfig
--- a/llama_stack/distribution/ui/Containerfile
+++ b/llama_stack/distribution/ui/Containerfile
--- a/llama_stack/distribution/ui/README.md
+++ b/llama_stack/distribution/ui/README.md
@ -9,7 +9,7 @@
 1. Start up Llama Stack API server. More details [here](https://llama-stack.readthedocs.io/en/latest/getting_started/index.html).

 ```
-llama stack build --template together --image-type conda
+llama stack build --distro together --image-type venv

 llama stack run together
 ```
@ -36,7 +36,7 @@ llama-stack-client benchmarks register \
 3. Start Streamlit UI

 ```bash
-uv run --with ".[ui]" streamlit run llama_stack/distribution/ui/app.py
+uv run --with ".[ui]" streamlit run llama_stack/core/ui/app.py
 ```

 ## Environment Variables
--- a/llama_stack/distribution/ui/init.py
+++ b/llama_stack/distribution/ui/init.py
--- a/llama_stack/distribution/ui/app.py
+++ b/llama_stack/distribution/ui/app.py
--- a/llama_stack/distribution/ui/modules/init.py
+++ b/llama_stack/distribution/ui/modules/init.py
--- a/llama_stack/distribution/ui/modules/api.py
+++ b/llama_stack/distribution/ui/modules/api.py
--- a/llama_stack/distribution/ui/modules/utils.py
+++ b/llama_stack/distribution/ui/modules/utils.py
--- a/llama_stack/distribution/ui/page/init.py
+++ b/llama_stack/distribution/ui/page/init.py
--- a/llama_stack/distribution/ui/page/distribution/init.py
+++ b/llama_stack/distribution/ui/page/distribution/init.py
--- a/llama_stack/distribution/ui/page/distribution/datasets.py
+++ b/llama_stack/distribution/ui/page/distribution/datasets.py
@ -6,7 +6,7 @@

 import streamlit as st

-from llama_stack.distribution.ui.modules.api import llama_stack_api
+from llama_stack.core.ui.modules.api import llama_stack_api


 def datasets():
--- a/llama_stack/distribution/ui/page/distribution/eval_tasks.py
+++ b/llama_stack/distribution/ui/page/distribution/eval_tasks.py
@ -6,7 +6,7 @@

 import streamlit as st

-from llama_stack.distribution.ui.modules.api import llama_stack_api
+from llama_stack.core.ui.modules.api import llama_stack_api


 def benchmarks():
--- a/llama_stack/distribution/ui/page/distribution/models.py
+++ b/llama_stack/distribution/ui/page/distribution/models.py
@ -6,7 +6,7 @@

 import streamlit as st

-from llama_stack.distribution.ui.modules.api import llama_stack_api
+from llama_stack.core.ui.modules.api import llama_stack_api


 def models():
--- a/llama_stack/distribution/ui/page/distribution/providers.py
+++ b/llama_stack/distribution/ui/page/distribution/providers.py
@ -6,7 +6,7 @@

 import streamlit as st

-from llama_stack.distribution.ui.modules.api import llama_stack_api
+from llama_stack.core.ui.modules.api import llama_stack_api


 def providers():
--- a/llama_stack/distribution/ui/page/distribution/resources.py
+++ b/llama_stack/distribution/ui/page/distribution/resources.py
@ -6,12 +6,12 @@

 from streamlit_option_menu import option_menu

-from llama_stack.distribution.ui.page.distribution.datasets import datasets
-from llama_stack.distribution.ui.page.distribution.eval_tasks import benchmarks
-from llama_stack.distribution.ui.page.distribution.models import models
-from llama_stack.distribution.ui.page.distribution.scoring_functions import scoring_functions
-from llama_stack.distribution.ui.page.distribution.shields import shields
-from llama_stack.distribution.ui.page.distribution.vector_dbs import vector_dbs
+from llama_stack.core.ui.page.distribution.datasets import datasets
+from llama_stack.core.ui.page.distribution.eval_tasks import benchmarks
+from llama_stack.core.ui.page.distribution.models import models
+from llama_stack.core.ui.page.distribution.scoring_functions import scoring_functions
+from llama_stack.core.ui.page.distribution.shields import shields
+from llama_stack.core.ui.page.distribution.vector_dbs import vector_dbs


 def resources_page():
--- a/llama_stack/distribution/ui/page/distribution/scoring_functions.py
+++ b/llama_stack/distribution/ui/page/distribution/scoring_functions.py
@ -6,7 +6,7 @@

 import streamlit as st

-from llama_stack.distribution.ui.modules.api import llama_stack_api
+from llama_stack.core.ui.modules.api import llama_stack_api


 def scoring_functions():
--- a/Show more
+++ b/Show more