Merge branch 'main' into feat/gunicorn-production-server

2025-12-03 09:53:45 +00:00 · 2025-11-24 12:08:57 +02:00 · 2025-11-24 12:08:57 +02:00 · 893d49c59e
commit 893d49c59e
parent 8fb237b6fb 3434c92a14
2086 changed files with 133277 additions and 643859 deletions
--- a/src/llama_stack/__init__.py
+++ b/src/llama_stack/__init__.py
@@ -3,8 +3,3 @@
 #
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
-
-from llama_stack.core.library_client import (  # noqa: F401
-    AsyncLlamaStackAsLibraryClient,
-    LlamaStackAsLibraryClient,
-)
--- a/src/llama_stack/apis/__init__.py
+++ b/src/llama_stack/apis/__init__.py
@@ -1,5 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
--- a/src/llama_stack/apis/agents/__init__.py
+++ b/src/llama_stack/apis/agents/__init__.py
@@ -1,7 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from .agents import *
--- a/src/llama_stack/apis/agents/agents.py
+++ b/src/llama_stack/apis/agents/agents.py
@@ -1,814 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from collections.abc import AsyncIterator
-from datetime import datetime
-from enum import StrEnum
-from typing import Annotated, Any, Literal, Protocol, runtime_checkable
-
-from pydantic import BaseModel, ConfigDict, Field
-
-from llama_stack.apis.common.content_types import URL, ContentDelta, InterleavedContent
-from llama_stack.apis.common.responses import Order, PaginatedResponse
-from llama_stack.apis.inference import (
-    CompletionMessage,
-    ResponseFormat,
-    SamplingParams,
-    ToolCall,
-    ToolChoice,
-    ToolConfig,
-    ToolPromptFormat,
-    ToolResponse,
-    ToolResponseMessage,
-    UserMessage,
-)
-from llama_stack.apis.safety import SafetyViolation
-from llama_stack.apis.tools import ToolDef
-from llama_stack.apis.version import LLAMA_STACK_API_V1, LLAMA_STACK_API_V1ALPHA
-from llama_stack.schema_utils import ExtraBodyField, json_schema_type, register_schema, webmethod
-
-from .openai_responses import (
-    ListOpenAIResponseInputItem,
-    ListOpenAIResponseObject,
-    OpenAIDeleteResponseObject,
-    OpenAIResponseInput,
-    OpenAIResponseInputTool,
-    OpenAIResponseObject,
-    OpenAIResponseObjectStream,
-    OpenAIResponsePrompt,
-    OpenAIResponseText,
-)
-
-
-@json_schema_type
-class ResponseGuardrailSpec(BaseModel):
-    """Specification for a guardrail to apply during response generation.
-
-    :param type: The type/identifier of the guardrail.
-    """
-
-    type: str
-    # TODO: more fields to be added for guardrail configuration
-
-
-ResponseGuardrail = str | ResponseGuardrailSpec
-
-
-class Attachment(BaseModel):
-    """An attachment to an agent turn.
-
-    :param content: The content of the attachment.
-    :param mime_type: The MIME type of the attachment.
-    """
-
-    content: InterleavedContent | URL
-    mime_type: str
-
-
-class Document(BaseModel):
-    """A document to be used by an agent.
-
-    :param content: The content of the document.
-    :param mime_type: The MIME type of the document.
-    """
-
-    content: InterleavedContent | URL
-    mime_type: str
-
-
-class StepCommon(BaseModel):
-    """A common step in an agent turn.
-
-    :param turn_id: The ID of the turn.
-    :param step_id: The ID of the step.
-    :param started_at: The time the step started.
-    :param completed_at: The time the step completed.
-    """
-
-    turn_id: str
-    step_id: str
-    started_at: datetime | None = None
-    completed_at: datetime | None = None
-
-
-class StepType(StrEnum):
-    """Type of the step in an agent turn.
-
-    :cvar inference: The step is an inference step that calls an LLM.
-    :cvar tool_execution: The step is a tool execution step that executes a tool call.
-    :cvar shield_call: The step is a shield call step that checks for safety violations.
-    :cvar memory_retrieval: The step is a memory retrieval step that retrieves context for vector dbs.
-    """
-
-    inference = "inference"
-    tool_execution = "tool_execution"
-    shield_call = "shield_call"
-    memory_retrieval = "memory_retrieval"
-
-
-@json_schema_type
-class InferenceStep(StepCommon):
-    """An inference step in an agent turn.
-
-    :param model_response: The response from the LLM.
-    """
-
-    model_config = ConfigDict(protected_namespaces=())
-
-    step_type: Literal[StepType.inference] = StepType.inference
-    model_response: CompletionMessage
-
-
-@json_schema_type
-class ToolExecutionStep(StepCommon):
-    """A tool execution step in an agent turn.
-
-    :param tool_calls: The tool calls to execute.
-    :param tool_responses: The tool responses from the tool calls.
-    """
-
-    step_type: Literal[StepType.tool_execution] = StepType.tool_execution
-    tool_calls: list[ToolCall]
-    tool_responses: list[ToolResponse]
-
-
-@json_schema_type
-class ShieldCallStep(StepCommon):
-    """A shield call step in an agent turn.
-
-    :param violation: The violation from the shield call.
-    """
-
-    step_type: Literal[StepType.shield_call] = StepType.shield_call
-    violation: SafetyViolation | None
-
-
-@json_schema_type
-class MemoryRetrievalStep(StepCommon):
-    """A memory retrieval step in an agent turn.
-
-    :param vector_store_ids: The IDs of the vector databases to retrieve context from.
-    :param inserted_context: The context retrieved from the vector databases.
-    """
-
-    step_type: Literal[StepType.memory_retrieval] = StepType.memory_retrieval
-    # TODO: should this be List[str]?
-    vector_store_ids: str
-    inserted_context: InterleavedContent
-
-
-Step = Annotated[
-    InferenceStep | ToolExecutionStep | ShieldCallStep | MemoryRetrievalStep,
-    Field(discriminator="step_type"),
-]
-
-
-@json_schema_type
-class Turn(BaseModel):
-    """A single turn in an interaction with an Agentic System.
-
-    :param turn_id: Unique identifier for the turn within a session
-    :param session_id: Unique identifier for the conversation session
-    :param input_messages: List of messages that initiated this turn
-    :param steps: Ordered list of processing steps executed during this turn
-    :param output_message: The model's generated response containing content and metadata
-    :param output_attachments: (Optional) Files or media attached to the agent's response
-    :param started_at: Timestamp when the turn began
-    :param completed_at: (Optional) Timestamp when the turn finished, if completed
-    """
-
-    turn_id: str
-    session_id: str
-    input_messages: list[UserMessage | ToolResponseMessage]
-    steps: list[Step]
-    output_message: CompletionMessage
-    output_attachments: list[Attachment] | None = Field(default_factory=lambda: [])
-
-    started_at: datetime
-    completed_at: datetime | None = None
-
-
-@json_schema_type
-class Session(BaseModel):
-    """A single session of an interaction with an Agentic System.
-
-    :param session_id: Unique identifier for the conversation session
-    :param session_name: Human-readable name for the session
-    :param turns: List of all turns that have occurred in this session
-    :param started_at: Timestamp when the session was created
-    """
-
-    session_id: str
-    session_name: str
-    turns: list[Turn]
-    started_at: datetime
-
-
-class AgentToolGroupWithArgs(BaseModel):
-    name: str
-    args: dict[str, Any]
-
-
-AgentToolGroup = str | AgentToolGroupWithArgs
-register_schema(AgentToolGroup, name="AgentTool")
-
-
-class AgentConfigCommon(BaseModel):
-    sampling_params: SamplingParams | None = Field(default_factory=SamplingParams)
-
-    input_shields: list[str] | None = Field(default_factory=lambda: [])
-    output_shields: list[str] | None = Field(default_factory=lambda: [])
-    toolgroups: list[AgentToolGroup] | None = Field(default_factory=lambda: [])
-    client_tools: list[ToolDef] | None = Field(default_factory=lambda: [])
-    tool_choice: ToolChoice | None = Field(default=None, deprecated="use tool_config instead")
-    tool_prompt_format: ToolPromptFormat | None = Field(default=None, deprecated="use tool_config instead")
-    tool_config: ToolConfig | None = Field(default=None)
-
-    max_infer_iters: int | None = 10
-
-    def model_post_init(self, __context):
-        if self.tool_config:
-            if self.tool_choice and self.tool_config.tool_choice != self.tool_choice:
-                raise ValueError("tool_choice is deprecated. Use tool_choice in tool_config instead.")
-            if self.tool_prompt_format and self.tool_config.tool_prompt_format != self.tool_prompt_format:
-                raise ValueError("tool_prompt_format is deprecated. Use tool_prompt_format in tool_config instead.")
-        else:
-            params = {}
-            if self.tool_choice:
-                params["tool_choice"] = self.tool_choice
-            if self.tool_prompt_format:
-                params["tool_prompt_format"] = self.tool_prompt_format
-            self.tool_config = ToolConfig(**params)
-
-
-@json_schema_type
-class AgentConfig(AgentConfigCommon):
-    """Configuration for an agent.
-
-    :param model: The model identifier to use for the agent
-    :param instructions: The system instructions for the agent
-    :param name: Optional name for the agent, used in telemetry and identification
-    :param enable_session_persistence: Optional flag indicating whether session data has to be persisted
-    :param response_format: Optional response format configuration
-    """
-
-    model: str
-    instructions: str
-    name: str | None = None
-    enable_session_persistence: bool | None = False
-    response_format: ResponseFormat | None = None
-
-
-@json_schema_type
-class Agent(BaseModel):
-    """An agent instance with configuration and metadata.
-
-    :param agent_id: Unique identifier for the agent
-    :param agent_config: Configuration settings for the agent
-    :param created_at: Timestamp when the agent was created
-    """
-
-    agent_id: str
-    agent_config: AgentConfig
-    created_at: datetime
-
-
-class AgentConfigOverridablePerTurn(AgentConfigCommon):
-    instructions: str | None = None
-
-
-class AgentTurnResponseEventType(StrEnum):
-    step_start = "step_start"
-    step_complete = "step_complete"
-    step_progress = "step_progress"
-
-    turn_start = "turn_start"
-    turn_complete = "turn_complete"
-    turn_awaiting_input = "turn_awaiting_input"
-
-
-@json_schema_type
-class AgentTurnResponseStepStartPayload(BaseModel):
-    """Payload for step start events in agent turn responses.
-
-    :param event_type: Type of event being reported
-    :param step_type: Type of step being executed
-    :param step_id: Unique identifier for the step within a turn
-    :param metadata: (Optional) Additional metadata for the step
-    """
-
-    event_type: Literal[AgentTurnResponseEventType.step_start] = AgentTurnResponseEventType.step_start
-    step_type: StepType
-    step_id: str
-    metadata: dict[str, Any] | None = Field(default_factory=lambda: {})
-
-
-@json_schema_type
-class AgentTurnResponseStepCompletePayload(BaseModel):
-    """Payload for step completion events in agent turn responses.
-
-    :param event_type: Type of event being reported
-    :param step_type: Type of step being executed
-    :param step_id: Unique identifier for the step within a turn
-    :param step_details: Complete details of the executed step
-    """
-
-    event_type: Literal[AgentTurnResponseEventType.step_complete] = AgentTurnResponseEventType.step_complete
-    step_type: StepType
-    step_id: str
-    step_details: Step
-
-
-@json_schema_type
-class AgentTurnResponseStepProgressPayload(BaseModel):
-    """Payload for step progress events in agent turn responses.
-
-    :param event_type: Type of event being reported
-    :param step_type: Type of step being executed
-    :param step_id: Unique identifier for the step within a turn
-    :param delta: Incremental content changes during step execution
-    """
-
-    model_config = ConfigDict(protected_namespaces=())
-
-    event_type: Literal[AgentTurnResponseEventType.step_progress] = AgentTurnResponseEventType.step_progress
-    step_type: StepType
-    step_id: str
-
-    delta: ContentDelta
-
-
-@json_schema_type
-class AgentTurnResponseTurnStartPayload(BaseModel):
-    """Payload for turn start events in agent turn responses.
-
-    :param event_type: Type of event being reported
-    :param turn_id: Unique identifier for the turn within a session
-    """
-
-    event_type: Literal[AgentTurnResponseEventType.turn_start] = AgentTurnResponseEventType.turn_start
-    turn_id: str
-
-
-@json_schema_type
-class AgentTurnResponseTurnCompletePayload(BaseModel):
-    """Payload for turn completion events in agent turn responses.
-
-    :param event_type: Type of event being reported
-    :param turn: Complete turn data including all steps and results
-    """
-
-    event_type: Literal[AgentTurnResponseEventType.turn_complete] = AgentTurnResponseEventType.turn_complete
-    turn: Turn
-
-
-@json_schema_type
-class AgentTurnResponseTurnAwaitingInputPayload(BaseModel):
-    """Payload for turn awaiting input events in agent turn responses.
-
-    :param event_type: Type of event being reported
-    :param turn: Turn data when waiting for external tool responses
-    """
-
-    event_type: Literal[AgentTurnResponseEventType.turn_awaiting_input] = AgentTurnResponseEventType.turn_awaiting_input
-    turn: Turn
-
-
-AgentTurnResponseEventPayload = Annotated[
-    AgentTurnResponseStepStartPayload
-    | AgentTurnResponseStepProgressPayload
-    | AgentTurnResponseStepCompletePayload
-    | AgentTurnResponseTurnStartPayload
-    | AgentTurnResponseTurnCompletePayload
-    | AgentTurnResponseTurnAwaitingInputPayload,
-    Field(discriminator="event_type"),
-]
-register_schema(AgentTurnResponseEventPayload, name="AgentTurnResponseEventPayload")
-
-
-@json_schema_type
-class AgentTurnResponseEvent(BaseModel):
-    """An event in an agent turn response stream.
-
-    :param payload: Event-specific payload containing event data
-    """
-
-    payload: AgentTurnResponseEventPayload
-
-
-@json_schema_type
-class AgentCreateResponse(BaseModel):
-    """Response returned when creating a new agent.
-
-    :param agent_id: Unique identifier for the created agent
-    """
-
-    agent_id: str
-
-
-@json_schema_type
-class AgentSessionCreateResponse(BaseModel):
-    """Response returned when creating a new agent session.
-
-    :param session_id: Unique identifier for the created session
-    """
-
-    session_id: str
-
-
-@json_schema_type
-class AgentTurnCreateRequest(AgentConfigOverridablePerTurn):
-    """Request to create a new turn for an agent.
-
-    :param agent_id: Unique identifier for the agent
-    :param session_id: Unique identifier for the conversation session
-    :param messages: List of messages to start the turn with
-    :param documents: (Optional) List of documents to provide to the agent
-    :param toolgroups: (Optional) List of tool groups to make available for this turn
-    :param stream: (Optional) Whether to stream the response
-    :param tool_config: (Optional) Tool configuration to override agent defaults
-    """
-
-    agent_id: str
-    session_id: str
-
-    # TODO: figure out how we can simplify this and make why
-    # ToolResponseMessage needs to be here (it is function call
-    # execution from outside the system)
-    messages: list[UserMessage | ToolResponseMessage]
-
-    documents: list[Document] | None = None
-    toolgroups: list[AgentToolGroup] | None = Field(default_factory=lambda: [])
-
-    stream: bool | None = False
-    tool_config: ToolConfig | None = None
-
-
-@json_schema_type
-class AgentTurnResumeRequest(BaseModel):
-    """Request to resume an agent turn with tool responses.
-
-    :param agent_id: Unique identifier for the agent
-    :param session_id: Unique identifier for the conversation session
-    :param turn_id: Unique identifier for the turn within a session
-    :param tool_responses: List of tool responses to submit to continue the turn
-    :param stream: (Optional) Whether to stream the response
-    """
-
-    agent_id: str
-    session_id: str
-    turn_id: str
-    tool_responses: list[ToolResponse]
-    stream: bool | None = False
-
-
-@json_schema_type
-class AgentTurnResponseStreamChunk(BaseModel):
-    """Streamed agent turn completion response.
-
-    :param event: Individual event in the agent turn response stream
-    """
-
-    event: AgentTurnResponseEvent
-
-
-@json_schema_type
-class AgentStepResponse(BaseModel):
-    """Response containing details of a specific agent step.
-
-    :param step: The complete step data and execution details
-    """
-
-    step: Step
-
-
-@runtime_checkable
-class Agents(Protocol):
-    """Agents
-
-    APIs for creating and interacting with agentic systems."""
-
-    @webmethod(
-        route="/agents",
-        method="POST",
-        descriptive_name="create_agent",
-        level=LLAMA_STACK_API_V1ALPHA,
-    )
-    async def create_agent(
-        self,
-        agent_config: AgentConfig,
-    ) -> AgentCreateResponse:
-        """Create an agent with the given configuration.
-
-        :param agent_config: The configuration for the agent.
-        :returns: An AgentCreateResponse with the agent ID.
-        """
-        ...
-
-    @webmethod(
-        route="/agents/{agent_id}/session/{session_id}/turn",
-        method="POST",
-        descriptive_name="create_agent_turn",
-        level=LLAMA_STACK_API_V1ALPHA,
-    )
-    async def create_agent_turn(
-        self,
-        agent_id: str,
-        session_id: str,
-        messages: list[UserMessage | ToolResponseMessage],
-        stream: bool | None = False,
-        documents: list[Document] | None = None,
-        toolgroups: list[AgentToolGroup] | None = None,
-        tool_config: ToolConfig | None = None,
-    ) -> Turn | AsyncIterator[AgentTurnResponseStreamChunk]:
-        """Create a new turn for an agent.
-
-        :param agent_id: The ID of the agent to create the turn for.
-        :param session_id: The ID of the session to create the turn for.
-        :param messages: List of messages to start the turn with.
-        :param stream: (Optional) If True, generate an SSE event stream of the response. Defaults to False.
-        :param documents: (Optional) List of documents to create the turn with.
-        :param toolgroups: (Optional) List of toolgroups to create the turn with, will be used in addition to the agent's config toolgroups for the request.
-        :param tool_config: (Optional) The tool configuration to create the turn with, will be used to override the agent's tool_config.
-        :returns: If stream=False, returns a Turn object.
-                  If stream=True, returns an SSE event stream of AgentTurnResponseStreamChunk.
-        """
-        ...
-
-    @webmethod(
-        route="/agents/{agent_id}/session/{session_id}/turn/{turn_id}/resume",
-        method="POST",
-        descriptive_name="resume_agent_turn",
-        level=LLAMA_STACK_API_V1ALPHA,
-    )
-    async def resume_agent_turn(
-        self,
-        agent_id: str,
-        session_id: str,
-        turn_id: str,
-        tool_responses: list[ToolResponse],
-        stream: bool | None = False,
-    ) -> Turn | AsyncIterator[AgentTurnResponseStreamChunk]:
-        """Resume an agent turn with executed tool call responses.
-
-        When a Turn has the status `awaiting_input` due to pending input from client side tool calls, this endpoint can be used to submit the outputs from the tool calls once they are ready.
-
-        :param agent_id: The ID of the agent to resume.
-        :param session_id: The ID of the session to resume.
-        :param turn_id: The ID of the turn to resume.
-        :param tool_responses: The tool call responses to resume the turn with.
-        :param stream: Whether to stream the response.
-        :returns: A Turn object if stream is False, otherwise an AsyncIterator of AgentTurnResponseStreamChunk objects.
-        """
-        ...
-
-    @webmethod(
-        route="/agents/{agent_id}/session/{session_id}/turn/{turn_id}",
-        method="GET",
-        level=LLAMA_STACK_API_V1ALPHA,
-    )
-    async def get_agents_turn(
-        self,
-        agent_id: str,
-        session_id: str,
-        turn_id: str,
-    ) -> Turn:
-        """Retrieve an agent turn by its ID.
-
-        :param agent_id: The ID of the agent to get the turn for.
-        :param session_id: The ID of the session to get the turn for.
-        :param turn_id: The ID of the turn to get.
-        :returns: A Turn.
-        """
-        ...
-
-    @webmethod(
-        route="/agents/{agent_id}/session/{session_id}/turn/{turn_id}/step/{step_id}",
-        method="GET",
-        level=LLAMA_STACK_API_V1ALPHA,
-    )
-    async def get_agents_step(
-        self,
-        agent_id: str,
-        session_id: str,
-        turn_id: str,
-        step_id: str,
-    ) -> AgentStepResponse:
-        """Retrieve an agent step by its ID.
-
-        :param agent_id: The ID of the agent to get the step for.
-        :param session_id: The ID of the session to get the step for.
-        :param turn_id: The ID of the turn to get the step for.
-        :param step_id: The ID of the step to get.
-        :returns: An AgentStepResponse.
-        """
-        ...
-
-    @webmethod(
-        route="/agents/{agent_id}/session",
-        method="POST",
-        descriptive_name="create_agent_session",
-        level=LLAMA_STACK_API_V1ALPHA,
-    )
-    async def create_agent_session(
-        self,
-        agent_id: str,
-        session_name: str,
-    ) -> AgentSessionCreateResponse:
-        """Create a new session for an agent.
-
-        :param agent_id: The ID of the agent to create the session for.
-        :param session_name: The name of the session to create.
-        :returns: An AgentSessionCreateResponse.
-        """
-        ...
-
-    @webmethod(
-        route="/agents/{agent_id}/session/{session_id}",
-        method="GET",
-        level=LLAMA_STACK_API_V1ALPHA,
-    )
-    async def get_agents_session(
-        self,
-        session_id: str,
-        agent_id: str,
-        turn_ids: list[str] | None = None,
-    ) -> Session:
-        """Retrieve an agent session by its ID.
-
-        :param session_id: The ID of the session to get.
-        :param agent_id: The ID of the agent to get the session for.
-        :param turn_ids: (Optional) List of turn IDs to filter the session by.
-        :returns: A Session.
-        """
-        ...
-
-    @webmethod(
-        route="/agents/{agent_id}/session/{session_id}",
-        method="DELETE",
-        level=LLAMA_STACK_API_V1ALPHA,
-    )
-    async def delete_agents_session(
-        self,
-        session_id: str,
-        agent_id: str,
-    ) -> None:
-        """Delete an agent session by its ID and its associated turns.
-
-        :param session_id: The ID of the session to delete.
-        :param agent_id: The ID of the agent to delete the session for.
-        """
-        ...
-
-    @webmethod(route="/agents/{agent_id}", method="DELETE", level=LLAMA_STACK_API_V1ALPHA)
-    async def delete_agent(
-        self,
-        agent_id: str,
-    ) -> None:
-        """Delete an agent by its ID and its associated sessions and turns.
-
-        :param agent_id: The ID of the agent to delete.
-        """
-        ...
-
-    @webmethod(route="/agents", method="GET", level=LLAMA_STACK_API_V1ALPHA)
-    async def list_agents(self, start_index: int | None = None, limit: int | None = None) -> PaginatedResponse:
-        """List all agents.
-
-        :param start_index: The index to start the pagination from.
-        :param limit: The number of agents to return.
-        :returns: A PaginatedResponse.
-        """
-        ...
-
-    @webmethod(route="/agents/{agent_id}", method="GET", level=LLAMA_STACK_API_V1ALPHA)
-    async def get_agent(self, agent_id: str) -> Agent:
-        """Describe an agent by its ID.
-
-        :param agent_id: ID of the agent.
-        :returns: An Agent of the agent.
-        """
-        ...
-
-    @webmethod(route="/agents/{agent_id}/sessions", method="GET", level=LLAMA_STACK_API_V1ALPHA)
-    async def list_agent_sessions(
-        self,
-        agent_id: str,
-        start_index: int | None = None,
-        limit: int | None = None,
-    ) -> PaginatedResponse:
-        """List all session(s) of a given agent.
-
-        :param agent_id: The ID of the agent to list sessions for.
-        :param start_index: The index to start the pagination from.
-        :param limit: The number of sessions to return.
-        :returns: A PaginatedResponse.
-        """
-        ...
-
-    # We situate the OpenAI Responses API in the Agents API just like we did things
-    # for Inference. The Responses API, in its intent, serves the same purpose as
-    # the Agents API above -- it is essentially a lightweight "agentic loop" with
-    # integrated tool calling.
-    #
-    # Both of these APIs are inherently stateful.
-
-    @webmethod(route="/responses/{response_id}", method="GET", level=LLAMA_STACK_API_V1)
-    async def get_openai_response(
-        self,
-        response_id: str,
-    ) -> OpenAIResponseObject:
-        """Get a model response.
-
-        :param response_id: The ID of the OpenAI response to retrieve.
-        :returns: An OpenAIResponseObject.
-        """
-        ...
-
-    @webmethod(route="/responses", method="POST", level=LLAMA_STACK_API_V1)
-    async def create_openai_response(
-        self,
-        input: str | list[OpenAIResponseInput],
-        model: str,
-        prompt: OpenAIResponsePrompt | None = None,
-        instructions: str | None = None,
-        previous_response_id: str | None = None,
-        conversation: str | None = None,
-        store: bool | None = True,
-        stream: bool | None = False,
-        temperature: float | None = None,
-        text: OpenAIResponseText | None = None,
-        tools: list[OpenAIResponseInputTool] | None = None,
-        include: list[str] | None = None,
-        max_infer_iters: int | None = 10,  # this is an extension to the OpenAI API
-        guardrails: Annotated[
-            list[ResponseGuardrail] | None,
-            ExtraBodyField(
-                "List of guardrails to apply during response generation. Guardrails provide safety and content moderation."
-            ),
-        ] = None,
-    ) -> OpenAIResponseObject | AsyncIterator[OpenAIResponseObjectStream]:
-        """Create a model response.
-
-        :param input: Input message(s) to create the response.
-        :param model: The underlying LLM used for completions.
-        :param prompt: (Optional) Prompt object with ID, version, and variables.
-        :param previous_response_id: (Optional) if specified, the new response will be a continuation of the previous response. This can be used to easily fork-off new responses from existing responses.
-        :param conversation: (Optional) The ID of a conversation to add the response to. Must begin with 'conv_'. Input and output messages will be automatically added to the conversation.
-        :param include: (Optional) Additional fields to include in the response.
-        :param guardrails: (Optional) List of guardrails to apply during response generation. Can be guardrail IDs (strings) or guardrail specifications.
-        :returns: An OpenAIResponseObject.
-        """
-        ...
-
-    @webmethod(route="/responses", method="GET", level=LLAMA_STACK_API_V1)
-    async def list_openai_responses(
-        self,
-        after: str | None = None,
-        limit: int | None = 50,
-        model: str | None = None,
-        order: Order | None = Order.desc,
-    ) -> ListOpenAIResponseObject:
-        """List all responses.
-
-        :param after: The ID of the last response to return.
-        :param limit: The number of responses to return.
-        :param model: The model to filter responses by.
-        :param order: The order to sort responses by when sorted by created_at ('asc' or 'desc').
-        :returns: A ListOpenAIResponseObject.
-        """
-        ...
-
-    @webmethod(route="/responses/{response_id}/input_items", method="GET", level=LLAMA_STACK_API_V1)
-    async def list_openai_response_input_items(
-        self,
-        response_id: str,
-        after: str | None = None,
-        before: str | None = None,
-        include: list[str] | None = None,
-        limit: int | None = 20,
-        order: Order | None = Order.desc,
-    ) -> ListOpenAIResponseInputItem:
-        """List input items.
-
-        :param response_id: The ID of the response to retrieve input items for.
-        :param after: An item ID to list items after, used for pagination.
-        :param before: An item ID to list items before, used for pagination.
-        :param include: Additional fields to include in the response.
-        :param limit: A limit on the number of objects to be returned. Limit can range between 1 and 100, and the default is 20.
-        :param order: The order to return the input items in. Default is desc.
-        :returns: An ListOpenAIResponseInputItem.
-        """
-        ...
-
-    @webmethod(route="/responses/{response_id}", method="DELETE", level=LLAMA_STACK_API_V1)
-    async def delete_openai_response(self, response_id: str) -> OpenAIDeleteResponseObject:
-        """Delete a response.
-
-        :param response_id: The ID of the OpenAI response to delete.
-        :returns: An OpenAIDeleteResponseObject
-        """
-        ...
--- a/src/llama_stack/apis/agents/openai_responses.py
+++ b/src/llama_stack/apis/agents/openai_responses.py
--- a/src/llama_stack/apis/batches/__init__.py
+++ b/src/llama_stack/apis/batches/__init__.py
@@ -1,9 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from .batches import Batches, BatchObject, ListBatchesResponse
-
-__all__ = ["Batches", "BatchObject", "ListBatchesResponse"]
--- a/src/llama_stack/apis/batches/batches.py
+++ b/src/llama_stack/apis/batches/batches.py
@ -1,96 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from typing import Literal, Protocol, runtime_checkable
-
-from pydantic import BaseModel, Field
-
-from llama_stack.apis.version import LLAMA_STACK_API_V1
-from llama_stack.schema_utils import json_schema_type, webmethod
-
-try:
-    from openai.types import Batch as BatchObject
-except ImportError as e:
-    raise ImportError("OpenAI package is required for batches API. Please install it with: pip install openai") from e
-
-
-@json_schema_type
-class ListBatchesResponse(BaseModel):
-    """Response containing a list of batch objects."""
-
-    object: Literal["list"] = "list"
-    data: list[BatchObject] = Field(..., description="List of batch objects")
-    first_id: str | None = Field(default=None, description="ID of the first batch in the list")
-    last_id: str | None = Field(default=None, description="ID of the last batch in the list")
-    has_more: bool = Field(default=False, description="Whether there are more batches available")
-
-
-@runtime_checkable
-class Batches(Protocol):
-    """
-    The Batches API enables efficient processing of multiple requests in a single operation,
-    particularly useful for processing large datasets, batch evaluation workflows, and
-    cost-effective inference at scale.
-
-    The API is designed to allow use of openai client libraries for seamless integration.
-
-    This API provides the following extensions:
-     - idempotent batch creation
-
-    Note: This API is currently under active development and may undergo changes.
-    """
-
-    @webmethod(route="/batches", method="POST", level=LLAMA_STACK_API_V1)
-    async def create_batch(
-        self,
-        input_file_id: str,
-        endpoint: str,
-        completion_window: Literal["24h"],
-        metadata: dict[str, str] | None = None,
-        idempotency_key: str | None = None,
-    ) -> BatchObject:
-        """Create a new batch for processing multiple API requests.
-
-        :param input_file_id: The ID of an uploaded file containing requests for the batch.
-        :param endpoint: The endpoint to be used for all requests in the batch.
-        :param completion_window: The time window within which the batch should be processed.
-        :param metadata: Optional metadata for the batch.
-        :param idempotency_key: Optional idempotency key. When provided, enables idempotent behavior.
-        :returns: The created batch object.
-        """
-        ...
-
-    @webmethod(route="/batches/{batch_id}", method="GET", level=LLAMA_STACK_API_V1)
-    async def retrieve_batch(self, batch_id: str) -> BatchObject:
-        """Retrieve information about a specific batch.
-
-        :param batch_id: The ID of the batch to retrieve.
-        :returns: The batch object.
-        """
-        ...
-
-    @webmethod(route="/batches/{batch_id}/cancel", method="POST", level=LLAMA_STACK_API_V1)
-    async def cancel_batch(self, batch_id: str) -> BatchObject:
-        """Cancel a batch that is in progress.
-
-        :param batch_id: The ID of the batch to cancel.
-        :returns: The updated batch object.
-        """
-        ...
-
-    @webmethod(route="/batches", method="GET", level=LLAMA_STACK_API_V1)
-    async def list_batches(
-        self,
-        after: str | None = None,
-        limit: int = 20,
-    ) -> ListBatchesResponse:
-        """List all batches for the current user.
-
-        :param after: A cursor for pagination; returns batches after this batch ID.
-        :param limit: Number of batches to return (default 20, max 100).
-        :returns: A list of batch objects.
-        """
-        ...
--- a/src/llama_stack/apis/benchmarks/init.py
+++ b/src/llama_stack/apis/benchmarks/init.py
@ -1,7 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from .benchmarks import *
--- a/src/llama_stack/apis/benchmarks/benchmarks.py
+++ b/src/llama_stack/apis/benchmarks/benchmarks.py
@ -1,104 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-from typing import Any, Literal, Protocol, runtime_checkable
-
-from pydantic import BaseModel, Field
-
-from llama_stack.apis.resource import Resource, ResourceType
-from llama_stack.apis.version import LLAMA_STACK_API_V1ALPHA
-from llama_stack.schema_utils import json_schema_type, webmethod
-
-
-class CommonBenchmarkFields(BaseModel):
-    dataset_id: str
-    scoring_functions: list[str]
-    metadata: dict[str, Any] = Field(
-        default_factory=dict,
-        description="Metadata for this evaluation task",
-    )
-
-
-@json_schema_type
-class Benchmark(CommonBenchmarkFields, Resource):
-    """A benchmark resource for evaluating model performance.
-
-    :param dataset_id: Identifier of the dataset to use for the benchmark evaluation
-    :param scoring_functions: List of scoring function identifiers to apply during evaluation
-    :param metadata: Metadata for this evaluation task
-    :param type: The resource type, always benchmark
-    """
-
-    type: Literal[ResourceType.benchmark] = ResourceType.benchmark
-
-    @property
-    def benchmark_id(self) -> str:
-        return self.identifier
-
-    @property
-    def provider_benchmark_id(self) -> str | None:
-        return self.provider_resource_id
-
-
-class BenchmarkInput(CommonBenchmarkFields, BaseModel):
-    benchmark_id: str
-    provider_id: str | None = None
-    provider_benchmark_id: str | None = None
-
-
-class ListBenchmarksResponse(BaseModel):
-    data: list[Benchmark]
-
-
-@runtime_checkable
-class Benchmarks(Protocol):
-    @webmethod(route="/eval/benchmarks", method="GET", level=LLAMA_STACK_API_V1ALPHA)
-    async def list_benchmarks(self) -> ListBenchmarksResponse:
-        """List all benchmarks.
-
-        :returns: A ListBenchmarksResponse.
-        """
-        ...
-
-    @webmethod(route="/eval/benchmarks/{benchmark_id}", method="GET", level=LLAMA_STACK_API_V1ALPHA)
-    async def get_benchmark(
-        self,
-        benchmark_id: str,
-    ) -> Benchmark:
-        """Get a benchmark by its ID.
-
-        :param benchmark_id: The ID of the benchmark to get.
-        :returns: A Benchmark.
-        """
-        ...
-
-    @webmethod(route="/eval/benchmarks", method="POST", level=LLAMA_STACK_API_V1ALPHA)
-    async def register_benchmark(
-        self,
-        benchmark_id: str,
-        dataset_id: str,
-        scoring_functions: list[str],
-        provider_benchmark_id: str | None = None,
-        provider_id: str | None = None,
-        metadata: dict[str, Any] | None = None,
-    ) -> None:
-        """Register a benchmark.
-
-        :param benchmark_id: The ID of the benchmark to register.
-        :param dataset_id: The ID of the dataset to use for the benchmark.
-        :param scoring_functions: The scoring functions to use for the benchmark.
-        :param provider_benchmark_id: The ID of the provider benchmark to use for the benchmark.
-        :param provider_id: The ID of the provider to use for the benchmark.
-        :param metadata: The metadata to use for the benchmark.
-        """
-        ...
-
-    @webmethod(route="/eval/benchmarks/{benchmark_id}", method="DELETE", level=LLAMA_STACK_API_V1ALPHA)
-    async def unregister_benchmark(self, benchmark_id: str) -> None:
-        """Unregister a benchmark.
-
-        :param benchmark_id: The ID of the benchmark to unregister.
-        """
-        ...
--- a/src/llama_stack/apis/common/init.py
+++ b/src/llama_stack/apis/common/init.py
@ -1,5 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
--- a/src/llama_stack/apis/common/content_types.py
+++ b/src/llama_stack/apis/common/content_types.py
@ -1,143 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from enum import Enum
-from typing import Annotated, Literal
-
-from pydantic import BaseModel, Field, model_validator
-
-from llama_stack.models.llama.datatypes import ToolCall
-from llama_stack.schema_utils import json_schema_type, register_schema
-
-
-@json_schema_type
-class URL(BaseModel):
-    """A URL reference to external content.
-
-    :param uri: The URL string pointing to the resource
-    """
-
-    uri: str
-
-
-class _URLOrData(BaseModel):
-    """
-    A URL or a base64 encoded string
-
-    :param url: A URL of the image or data URL in the format of data:image/{type};base64,{data}. Note that URL could have length limits.
-    :param data: base64 encoded image data as string
-    """
-
-    url: URL | None = None
-    # data is a base64 encoded string, hint with contentEncoding=base64
-    data: str | None = Field(default=None, json_schema_extra={"contentEncoding": "base64"})
-
-    @model_validator(mode="before")
-    @classmethod
-    def validator(cls, values):
-        if isinstance(values, dict):
-            return values
-        return {"url": values}
-
-
-@json_schema_type
-class ImageContentItem(BaseModel):
-    """A image content item
-
-    :param type: Discriminator type of the content item. Always "image"
-    :param image: Image as a base64 encoded string or an URL
-    """
-
-    type: Literal["image"] = "image"
-    image: _URLOrData
-
-
-@json_schema_type
-class TextContentItem(BaseModel):
-    """A text content item
-
-    :param type: Discriminator type of the content item. Always "text"
-    :param text: Text content
-    """
-
-    type: Literal["text"] = "text"
-    text: str
-
-
-# other modalities can be added here
-InterleavedContentItem = Annotated[
-    ImageContentItem | TextContentItem,
-    Field(discriminator="type"),
-]
-register_schema(InterleavedContentItem, name="InterleavedContentItem")
-
-# accept a single "str" as a special case since it is common
-InterleavedContent = str | InterleavedContentItem | list[InterleavedContentItem]
-register_schema(InterleavedContent, name="InterleavedContent")
-
-
-@json_schema_type
-class TextDelta(BaseModel):
-    """A text content delta for streaming responses.
-
-    :param type: Discriminator type of the delta. Always "text"
-    :param text: The incremental text content
-    """
-
-    type: Literal["text"] = "text"
-    text: str
-
-
-@json_schema_type
-class ImageDelta(BaseModel):
-    """An image content delta for streaming responses.
-
-    :param type: Discriminator type of the delta. Always "image"
-    :param image: The incremental image data as bytes
-    """
-
-    type: Literal["image"] = "image"
-    image: bytes
-
-
-class ToolCallParseStatus(Enum):
-    """Status of tool call parsing during streaming.
-    :cvar started: Tool call parsing has begun
-    :cvar in_progress: Tool call parsing is ongoing
-    :cvar failed: Tool call parsing failed
-    :cvar succeeded: Tool call parsing completed successfully
-    """
-
-    started = "started"
-    in_progress = "in_progress"
-    failed = "failed"
-    succeeded = "succeeded"
-
-
-@json_schema_type
-class ToolCallDelta(BaseModel):
-    """A tool call content delta for streaming responses.
-
-    :param type: Discriminator type of the delta. Always "tool_call"
-    :param tool_call: Either an in-progress tool call string or the final parsed tool call
-    :param parse_status: Current parsing status of the tool call
-    """
-
-    type: Literal["tool_call"] = "tool_call"
-
-    # you either send an in-progress tool call so the client can stream a long
-    # code generation or you send the final parsed tool call at the end of the
-    # stream
-    tool_call: str | ToolCall
-    parse_status: ToolCallParseStatus
-
-
-# streaming completions send a stream of ContentDeltas
-ContentDelta = Annotated[
-    TextDelta | ImageDelta | ToolCallDelta,
-    Field(discriminator="type"),
-]
-register_schema(ContentDelta, name="ContentDelta")
--- a/src/llama_stack/apis/common/errors.py
+++ b/src/llama_stack/apis/common/errors.py
@ -1,103 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-# Custom Llama Stack Exception classes should follow the following schema
-#   1. All classes should inherit from an existing Built-In Exception class: https://docs.python.org/3/library/exceptions.html
-#   2. All classes should have a custom error message with the goal of informing the Llama Stack user specifically
-#   3. All classes should propogate the inherited __init__ function otherwise via 'super().__init__(message)'
-
-
-class ResourceNotFoundError(ValueError):
-    """generic exception for a missing Llama Stack resource"""
-
-    def __init__(self, resource_name: str, resource_type: str, client_list: str) -> None:
-        message = (
-            f"{resource_type} '{resource_name}' not found. Use '{client_list}' to list available {resource_type}s."
-        )
-        super().__init__(message)
-
-
-class UnsupportedModelError(ValueError):
-    """raised when model is not present in the list of supported models"""
-
-    def __init__(self, model_name: str, supported_models_list: list[str]):
-        message = f"'{model_name}' model is not supported. Supported models are: {', '.join(supported_models_list)}"
-        super().__init__(message)
-
-
-class ModelNotFoundError(ResourceNotFoundError):
-    """raised when Llama Stack cannot find a referenced model"""
-
-    def __init__(self, model_name: str) -> None:
-        super().__init__(model_name, "Model", "client.models.list()")
-
-
-class VectorStoreNotFoundError(ResourceNotFoundError):
-    """raised when Llama Stack cannot find a referenced vector store"""
-
-    def __init__(self, vector_store_name: str) -> None:
-        super().__init__(vector_store_name, "Vector Store", "client.vector_dbs.list()")
-
-
-class DatasetNotFoundError(ResourceNotFoundError):
-    """raised when Llama Stack cannot find a referenced dataset"""
-
-    def __init__(self, dataset_name: str) -> None:
-        super().__init__(dataset_name, "Dataset", "client.datasets.list()")
-
-
-class ToolGroupNotFoundError(ResourceNotFoundError):
-    """raised when Llama Stack cannot find a referenced tool group"""
-
-    def __init__(self, toolgroup_name: str) -> None:
-        super().__init__(toolgroup_name, "Tool Group", "client.toolgroups.list()")
-
-
-class SessionNotFoundError(ValueError):
-    """raised when Llama Stack cannot find a referenced session or access is denied"""
-
-    def __init__(self, session_name: str) -> None:
-        message = f"Session '{session_name}' not found or access denied."
-        super().__init__(message)
-
-
-class ModelTypeError(TypeError):
-    """raised when a model is present but not the correct type"""
-
-    def __init__(self, model_name: str, model_type: str, expected_model_type: str) -> None:
-        message = (
-            f"Model '{model_name}' is of type '{model_type}' rather than the expected type '{expected_model_type}'"
-        )
-        super().__init__(message)
-
-
-class ConflictError(ValueError):
-    """raised when an operation cannot be performed due to a conflict with the current state"""
-
-    def __init__(self, message: str) -> None:
-        super().__init__(message)
-
-
-class TokenValidationError(ValueError):
-    """raised when token validation fails during authentication"""
-
-    def __init__(self, message: str) -> None:
-        super().__init__(message)
-
-
-class ConversationNotFoundError(ResourceNotFoundError):
-    """raised when Llama Stack cannot find a referenced conversation"""
-
-    def __init__(self, conversation_id: str) -> None:
-        super().__init__(conversation_id, "Conversation", "client.conversations.list()")
-
-
-class InvalidConversationIdError(ValueError):
-    """raised when a conversation ID has an invalid format"""
-
-    def __init__(self, conversation_id: str) -> None:
-        message = f"Invalid conversation ID '{conversation_id}'. Expected an ID that begins with 'conv_'."
-        super().__init__(message)
--- a/src/llama_stack/apis/common/job_types.py
+++ b/src/llama_stack/apis/common/job_types.py
@ -1,38 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-from enum import Enum
-
-from pydantic import BaseModel
-
-from llama_stack.schema_utils import json_schema_type
-
-
-class JobStatus(Enum):
-    """Status of a job execution.
-    :cvar completed: Job has finished successfully
-    :cvar in_progress: Job is currently running
-    :cvar failed: Job has failed during execution
-    :cvar scheduled: Job is scheduled but not yet started
-    :cvar cancelled: Job was cancelled before completion
-    """
-
-    completed = "completed"
-    in_progress = "in_progress"
-    failed = "failed"
-    scheduled = "scheduled"
-    cancelled = "cancelled"
-
-
-@json_schema_type
-class Job(BaseModel):
-    """A job execution instance with status tracking.
-
-    :param job_id: Unique identifier for the job
-    :param status: Current execution status of the job
-    """
-
-    job_id: str
-    status: JobStatus
--- a/src/llama_stack/apis/common/responses.py
+++ b/src/llama_stack/apis/common/responses.py
@ -1,36 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from enum import Enum
-from typing import Any
-
-from pydantic import BaseModel
-
-from llama_stack.schema_utils import json_schema_type
-
-
-class Order(Enum):
-    """Sort order for paginated responses.
-    :cvar asc: Ascending order
-    :cvar desc: Descending order
-    """
-
-    asc = "asc"
-    desc = "desc"
-
-
-@json_schema_type
-class PaginatedResponse(BaseModel):
-    """A generic paginated response that follows a simple format.
-
-    :param data: The list of items for the current page
-    :param has_more: Whether there are more items available after this set
-    :param url: The URL for accessing this list
-    """
-
-    data: list[dict[str, Any]]
-    has_more: bool
-    url: str | None = None
--- a/src/llama_stack/apis/common/training_types.py
+++ b/src/llama_stack/apis/common/training_types.py
@ -1,47 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from datetime import datetime
-
-from pydantic import BaseModel
-
-from llama_stack.schema_utils import json_schema_type
-
-
-@json_schema_type
-class PostTrainingMetric(BaseModel):
-    """Training metrics captured during post-training jobs.
-
-    :param epoch: Training epoch number
-    :param train_loss: Loss value on the training dataset
-    :param validation_loss: Loss value on the validation dataset
-    :param perplexity: Perplexity metric indicating model confidence
-    """
-
-    epoch: int
-    train_loss: float
-    validation_loss: float
-    perplexity: float
-
-
-@json_schema_type
-class Checkpoint(BaseModel):
-    """Checkpoint created during training runs.
-
-    :param identifier: Unique identifier for the checkpoint
-    :param created_at: Timestamp when the checkpoint was created
-    :param epoch: Training epoch when the checkpoint was saved
-    :param post_training_job_id: Identifier of the training job that created this checkpoint
-    :param path: File system path where the checkpoint is stored
-    :param training_metrics: (Optional) Training metrics associated with this checkpoint
-    """
-
-    identifier: str
-    created_at: datetime
-    epoch: int
-    post_training_job_id: str
-    path: str
-    training_metrics: PostTrainingMetric | None = None
--- a/src/llama_stack/apis/common/type_system.py
+++ b/src/llama_stack/apis/common/type_system.py
@ -1,158 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from typing import Annotated, Literal
-
-from pydantic import BaseModel, Field
-
-from llama_stack.schema_utils import json_schema_type, register_schema
-
-
-@json_schema_type
-class StringType(BaseModel):
-    """Parameter type for string values.
-
-    :param type: Discriminator type. Always "string"
-    """
-
-    type: Literal["string"] = "string"
-
-
-@json_schema_type
-class NumberType(BaseModel):
-    """Parameter type for numeric values.
-
-    :param type: Discriminator type. Always "number"
-    """
-
-    type: Literal["number"] = "number"
-
-
-@json_schema_type
-class BooleanType(BaseModel):
-    """Parameter type for boolean values.
-
-    :param type: Discriminator type. Always "boolean"
-    """
-
-    type: Literal["boolean"] = "boolean"
-
-
-@json_schema_type
-class ArrayType(BaseModel):
-    """Parameter type for array values.
-
-    :param type: Discriminator type. Always "array"
-    """
-
-    type: Literal["array"] = "array"
-
-
-@json_schema_type
-class ObjectType(BaseModel):
-    """Parameter type for object values.
-
-    :param type: Discriminator type. Always "object"
-    """
-
-    type: Literal["object"] = "object"
-
-
-@json_schema_type
-class JsonType(BaseModel):
-    """Parameter type for JSON values.
-
-    :param type: Discriminator type. Always "json"
-    """
-
-    type: Literal["json"] = "json"
-
-
-@json_schema_type
-class UnionType(BaseModel):
-    """Parameter type for union values.
-
-    :param type: Discriminator type. Always "union"
-    """
-
-    type: Literal["union"] = "union"
-
-
-@json_schema_type
-class ChatCompletionInputType(BaseModel):
-    """Parameter type for chat completion input.
-
-    :param type: Discriminator type. Always "chat_completion_input"
-    """
-
-    # expects List[Message] for messages
-    type: Literal["chat_completion_input"] = "chat_completion_input"
-
-
-@json_schema_type
-class CompletionInputType(BaseModel):
-    """Parameter type for completion input.
-
-    :param type: Discriminator type. Always "completion_input"
-    """
-
-    # expects InterleavedTextMedia for content
-    type: Literal["completion_input"] = "completion_input"
-
-
-@json_schema_type
-class AgentTurnInputType(BaseModel):
-    """Parameter type for agent turn input.
-
-    :param type: Discriminator type. Always "agent_turn_input"
-    """
-
-    # expects List[Message] for messages (may also include attachments?)
-    type: Literal["agent_turn_input"] = "agent_turn_input"
-
-
-@json_schema_type
-class DialogType(BaseModel):
-    """Parameter type for dialog data with semantic output labels.
-
-    :param type: Discriminator type. Always "dialog"
-    """
-
-    # expects List[Message] for messages
-    # this type semantically contains the output label whereas ChatCompletionInputType does not
-    type: Literal["dialog"] = "dialog"
-
-
-ParamType = Annotated[
-    StringType
-    | NumberType
-    | BooleanType
-    | ArrayType
-    | ObjectType
-    | JsonType
-    | UnionType
-    | ChatCompletionInputType
-    | CompletionInputType
-    | AgentTurnInputType,
-    Field(discriminator="type"),
-]
-register_schema(ParamType, name="ParamType")
-
-"""
-# TODO: recursive definition of ParamType in these containers
-# will cause infinite recursion in OpenAPI generation script
-# since we are going with ChatCompletionInputType and CompletionInputType
-# we don't need to worry about ArrayType/ObjectType/UnionType for now
-ArrayType.model_rebuild()
-ObjectType.model_rebuild()
-UnionType.model_rebuild()
-
-
-class CustomType(BaseModel):
-pylint: disable=syntax-error
-    type: Literal["custom"] = "custom"
-    validator_class: str
-"""
--- a/src/llama_stack/apis/conversations/init.py
+++ b/src/llama_stack/apis/conversations/init.py
@ -1,31 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from .conversations import (
-    Conversation,
-    ConversationCreateRequest,
-    ConversationDeletedResource,
-    ConversationItem,
-    ConversationItemCreateRequest,
-    ConversationItemDeletedResource,
-    ConversationItemList,
-    Conversations,
-    ConversationUpdateRequest,
-    Metadata,
-)
-
-__all__ = [
-    "Conversation",
-    "ConversationCreateRequest",
-    "ConversationDeletedResource",
-    "ConversationItem",
-    "ConversationItemCreateRequest",
-    "ConversationItemDeletedResource",
-    "ConversationItemList",
-    "Conversations",
-    "ConversationUpdateRequest",
-    "Metadata",
-]
--- a/src/llama_stack/apis/conversations/conversations.py
+++ b/src/llama_stack/apis/conversations/conversations.py
@ -1,298 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from enum import StrEnum
-from typing import Annotated, Literal, Protocol, runtime_checkable
-
-from pydantic import BaseModel, Field
-
-from llama_stack.apis.agents.openai_responses import (
-    OpenAIResponseInputFunctionToolCallOutput,
-    OpenAIResponseMCPApprovalRequest,
-    OpenAIResponseMCPApprovalResponse,
-    OpenAIResponseMessage,
-    OpenAIResponseOutputMessageFileSearchToolCall,
-    OpenAIResponseOutputMessageFunctionToolCall,
-    OpenAIResponseOutputMessageMCPCall,
-    OpenAIResponseOutputMessageMCPListTools,
-    OpenAIResponseOutputMessageWebSearchToolCall,
-)
-from llama_stack.apis.version import LLAMA_STACK_API_V1
-from llama_stack.core.telemetry.trace_protocol import trace_protocol
-from llama_stack.schema_utils import json_schema_type, register_schema, webmethod
-
-Metadata = dict[str, str]
-
-
-@json_schema_type
-class Conversation(BaseModel):
-    """OpenAI-compatible conversation object."""
-
-    id: str = Field(..., description="The unique ID of the conversation.")
-    object: Literal["conversation"] = Field(
-        default="conversation", description="The object type, which is always conversation."
-    )
-    created_at: int = Field(
-        ..., description="The time at which the conversation was created, measured in seconds since the Unix epoch."
-    )
-    metadata: Metadata | None = Field(
-        default=None,
-        description="Set of 16 key-value pairs that can be attached to an object. This can be useful for storing additional information about the object in a structured format, and querying for objects via API or the dashboard.",
-    )
-    items: list[dict] | None = Field(
-        default=None,
-        description="Initial items to include in the conversation context. You may add up to 20 items at a time.",
-    )
-
-
-@json_schema_type
-class ConversationMessage(BaseModel):
-    """OpenAI-compatible message item for conversations."""
-
-    id: str = Field(..., description="unique identifier for this message")
-    content: list[dict] = Field(..., description="message content")
-    role: str = Field(..., description="message role")
-    status: str = Field(..., description="message status")
-    type: Literal["message"] = "message"
-    object: Literal["message"] = "message"
-
-
-ConversationItem = Annotated[
-    OpenAIResponseMessage
-    | OpenAIResponseOutputMessageWebSearchToolCall
-    | OpenAIResponseOutputMessageFileSearchToolCall
-    | OpenAIResponseOutputMessageFunctionToolCall
-    | OpenAIResponseInputFunctionToolCallOutput
-    | OpenAIResponseMCPApprovalRequest
-    | OpenAIResponseMCPApprovalResponse
-    | OpenAIResponseOutputMessageMCPCall
-    | OpenAIResponseOutputMessageMCPListTools
-    | OpenAIResponseOutputMessageMCPCall
-    | OpenAIResponseOutputMessageMCPListTools,
-    Field(discriminator="type"),
-]
-register_schema(ConversationItem, name="ConversationItem")
-
-# Using OpenAI types directly caused issues but some notes for reference:
-# Note that ConversationItem is a Annotated Union of the types below:
-# from openai.types.responses import *
-# from openai.types.responses.response_item import *
-# from openai.types.conversations import ConversationItem
-# f = [
-#     ResponseFunctionToolCallItem,
-#     ResponseFunctionToolCallOutputItem,
-#     ResponseFileSearchToolCall,
-#     ResponseFunctionWebSearch,
-#     ImageGenerationCall,
-#     ResponseComputerToolCall,
-#     ResponseComputerToolCallOutputItem,
-#     ResponseReasoningItem,
-#     ResponseCodeInterpreterToolCall,
-#     LocalShellCall,
-#     LocalShellCallOutput,
-#     McpListTools,
-#     McpApprovalRequest,
-#     McpApprovalResponse,
-#     McpCall,
-#     ResponseCustomToolCall,
-#     ResponseCustomToolCallOutput
-# ]
-
-
-@json_schema_type
-class ConversationCreateRequest(BaseModel):
-    """Request body for creating a conversation."""
-
-    items: list[ConversationItem] | None = Field(
-        default=[],
-        description="Initial items to include in the conversation context. You may add up to 20 items at a time.",
-        max_length=20,
-    )
-    metadata: Metadata | None = Field(
-        default={},
-        description="Set of 16 key-value pairs that can be attached to an object. Useful for storing additional information",
-        max_length=16,
-    )
-
-
-@json_schema_type
-class ConversationUpdateRequest(BaseModel):
-    """Request body for updating a conversation."""
-
-    metadata: Metadata = Field(
-        ...,
-        description="Set of 16 key-value pairs that can be attached to an object. This can be useful for storing additional information about the object in a structured format, and querying for objects via API or the dashboard. Keys are strings with a maximum length of 64 characters. Values are strings with a maximum length of 512 characters.",
-    )
-
-
-@json_schema_type
-class ConversationDeletedResource(BaseModel):
-    """Response for deleted conversation."""
-
-    id: str = Field(..., description="The deleted conversation identifier")
-    object: str = Field(default="conversation.deleted", description="Object type")
-    deleted: bool = Field(default=True, description="Whether the object was deleted")
-
-
-@json_schema_type
-class ConversationItemCreateRequest(BaseModel):
-    """Request body for creating conversation items."""
-
-    items: list[ConversationItem] = Field(
-        ...,
-        description="Items to include in the conversation context. You may add up to 20 items at a time.",
-        max_length=20,
-    )
-
-
-class ConversationItemInclude(StrEnum):
-    """
-    Specify additional output data to include in the model response.
-    """
-
-    web_search_call_action_sources = "web_search_call.action.sources"
-    code_interpreter_call_outputs = "code_interpreter_call.outputs"
-    computer_call_output_output_image_url = "computer_call_output.output.image_url"
-    file_search_call_results = "file_search_call.results"
-    message_input_image_image_url = "message.input_image.image_url"
-    message_output_text_logprobs = "message.output_text.logprobs"
-    reasoning_encrypted_content = "reasoning.encrypted_content"
-
-
-@json_schema_type
-class ConversationItemList(BaseModel):
-    """List of conversation items with pagination."""
-
-    object: str = Field(default="list", description="Object type")
-    data: list[ConversationItem] = Field(..., description="List of conversation items")
-    first_id: str | None = Field(default=None, description="The ID of the first item in the list")
-    last_id: str | None = Field(default=None, description="The ID of the last item in the list")
-    has_more: bool = Field(default=False, description="Whether there are more items available")
-
-
-@json_schema_type
-class ConversationItemDeletedResource(BaseModel):
-    """Response for deleted conversation item."""
-
-    id: str = Field(..., description="The deleted item identifier")
-    object: str = Field(default="conversation.item.deleted", description="Object type")
-    deleted: bool = Field(default=True, description="Whether the object was deleted")
-
-
-@runtime_checkable
-@trace_protocol
-class Conversations(Protocol):
-    """Conversations
-
-    Protocol for conversation management operations."""
-
-    @webmethod(route="/conversations", method="POST", level=LLAMA_STACK_API_V1)
-    async def create_conversation(
-        self, items: list[ConversationItem] | None = None, metadata: Metadata | None = None
-    ) -> Conversation:
-        """Create a conversation.
-
-        Create a conversation.
-
-        :param items: Initial items to include in the conversation context.
-        :param metadata: Set of key-value pairs that can be attached to an object.
-        :returns: The created conversation object.
-        """
-        ...
-
-    @webmethod(route="/conversations/{conversation_id}", method="GET", level=LLAMA_STACK_API_V1)
-    async def get_conversation(self, conversation_id: str) -> Conversation:
-        """Retrieve a conversation.
-
-        Get a conversation with the given ID.
-
-        :param conversation_id: The conversation identifier.
-        :returns: The conversation object.
-        """
-        ...
-
-    @webmethod(route="/conversations/{conversation_id}", method="POST", level=LLAMA_STACK_API_V1)
-    async def update_conversation(self, conversation_id: str, metadata: Metadata) -> Conversation:
-        """Update a conversation.
-
-        Update a conversation's metadata with the given ID.
-
-        :param conversation_id: The conversation identifier.
-        :param metadata: Set of key-value pairs that can be attached to an object.
-        :returns: The updated conversation object.
-        """
-        ...
-
-    @webmethod(route="/conversations/{conversation_id}", method="DELETE", level=LLAMA_STACK_API_V1)
-    async def openai_delete_conversation(self, conversation_id: str) -> ConversationDeletedResource:
-        """Delete a conversation.
-
-        Delete a conversation with the given ID.
-
-        :param conversation_id: The conversation identifier.
-        :returns: The deleted conversation resource.
-        """
-        ...
-
-    @webmethod(route="/conversations/{conversation_id}/items", method="POST", level=LLAMA_STACK_API_V1)
-    async def add_items(self, conversation_id: str, items: list[ConversationItem]) -> ConversationItemList:
-        """Create items.
-
-        Create items in the conversation.
-
-        :param conversation_id: The conversation identifier.
-        :param items: Items to include in the conversation context.
-        :returns: List of created items.
-        """
-        ...
-
-    @webmethod(route="/conversations/{conversation_id}/items/{item_id}", method="GET", level=LLAMA_STACK_API_V1)
-    async def retrieve(self, conversation_id: str, item_id: str) -> ConversationItem:
-        """Retrieve an item.
-
-        Retrieve a conversation item.
-
-        :param conversation_id: The conversation identifier.
-        :param item_id: The item identifier.
-        :returns: The conversation item.
-        """
-        ...
-
-    @webmethod(route="/conversations/{conversation_id}/items", method="GET", level=LLAMA_STACK_API_V1)
-    async def list_items(
-        self,
-        conversation_id: str,
-        after: str | None = None,
-        include: list[ConversationItemInclude] | None = None,
-        limit: int | None = None,
-        order: Literal["asc", "desc"] | None = None,
-    ) -> ConversationItemList:
-        """List items.
-
-        List items in the conversation.
-
-        :param conversation_id: The conversation identifier.
-        :param after: An item ID to list items after, used in pagination.
-        :param include: Specify additional output data to include in the response.
-        :param limit: A limit on the number of objects to be returned (1-100, default 20).
-        :param order: The order to return items in (asc or desc, default desc).
-        :returns: List of conversation items.
-        """
-        ...
-
-    @webmethod(route="/conversations/{conversation_id}/items/{item_id}", method="DELETE", level=LLAMA_STACK_API_V1)
-    async def openai_delete_conversation_item(
-        self, conversation_id: str, item_id: str
-    ) -> ConversationItemDeletedResource:
-        """Delete an item.
-
-        Delete a conversation item.
-
-        :param conversation_id: The conversation identifier.
-        :param item_id: The item identifier.
-        :returns: The deleted item resource.
-        """
-        ...
--- a/src/llama_stack/apis/datasetio/datasetio.py
+++ b/src/llama_stack/apis/datasetio/datasetio.py
@ -1,55 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from typing import Any, Protocol, runtime_checkable
-
-from llama_stack.apis.common.responses import PaginatedResponse
-from llama_stack.apis.datasets import Dataset
-from llama_stack.apis.version import LLAMA_STACK_API_V1BETA
-from llama_stack.schema_utils import webmethod
-
-
-class DatasetStore(Protocol):
-    def get_dataset(self, dataset_id: str) -> Dataset: ...
-
-
-@runtime_checkable
-class DatasetIO(Protocol):
-    # keeping for aligning with inference/safety, but this is not used
-    dataset_store: DatasetStore
-
-    @webmethod(route="/datasetio/iterrows/{dataset_id:path}", method="GET", level=LLAMA_STACK_API_V1BETA)
-    async def iterrows(
-        self,
-        dataset_id: str,
-        start_index: int | None = None,
-        limit: int | None = None,
-    ) -> PaginatedResponse:
-        """Get a paginated list of rows from a dataset.
-
-        Uses offset-based pagination where:
-        - start_index: The starting index (0-based). If None, starts from beginning.
-        - limit: Number of items to return. If None or -1, returns all items.
-
-        The response includes:
-        - data: List of items for the current page.
-        - has_more: Whether there are more items available after this set.
-
-        :param dataset_id: The ID of the dataset to get the rows from.
-        :param start_index: Index into dataset for the first row to get. Starts from the beginning if None.
-        :param limit: The number of rows to get.
-        :returns: A PaginatedResponse.
-        """
-        ...
-
-    @webmethod(route="/datasetio/append-rows/{dataset_id:path}", method="POST", level=LLAMA_STACK_API_V1BETA)
-    async def append_rows(self, dataset_id: str, rows: list[dict[str, Any]]) -> None:
-        """Append rows to a dataset.
-
-        :param dataset_id: The ID of the dataset to append the rows to.
-        :param rows: The rows to append to the dataset.
-        """
-        ...
--- a/src/llama_stack/apis/datasets/init.py
+++ b/src/llama_stack/apis/datasets/init.py
@ -1,7 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from .datasets import *
--- a/src/llama_stack/apis/datasets/datasets.py
+++ b/src/llama_stack/apis/datasets/datasets.py
@ -1,247 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from enum import Enum, StrEnum
-from typing import Annotated, Any, Literal, Protocol
-
-from pydantic import BaseModel, Field
-
-from llama_stack.apis.resource import Resource, ResourceType
-from llama_stack.apis.version import LLAMA_STACK_API_V1BETA
-from llama_stack.schema_utils import json_schema_type, register_schema, webmethod
-
-
-class DatasetPurpose(StrEnum):
-    """
-    Purpose of the dataset. Each purpose has a required input data schema.
-
-    :cvar post-training/messages: The dataset contains messages used for post-training.
-        {
-            "messages": [
-                {"role": "user", "content": "Hello, world!"},
-                {"role": "assistant", "content": "Hello, world!"},
-            ]
-        }
-    :cvar eval/question-answer: The dataset contains a question column and an answer column.
-        {
-            "question": "What is the capital of France?",
-            "answer": "Paris"
-        }
-    :cvar eval/messages-answer: The dataset contains a messages column with list of messages and an answer column.
-        {
-            "messages": [
-                {"role": "user", "content": "Hello, my name is John Doe."},
-                {"role": "assistant", "content": "Hello, John Doe. How can I help you today?"},
-                {"role": "user", "content": "What's my name?"},
-            ],
-            "answer": "John Doe"
-        }
-    """
-
-    post_training_messages = "post-training/messages"
-    eval_question_answer = "eval/question-answer"
-    eval_messages_answer = "eval/messages-answer"
-
-    # TODO: add more schemas here
-
-
-class DatasetType(Enum):
-    """
-    Type of the dataset source.
-    :cvar uri: The dataset can be obtained from a URI.
-    :cvar rows: The dataset is stored in rows.
-    """
-
-    uri = "uri"
-    rows = "rows"
-
-
-@json_schema_type
-class URIDataSource(BaseModel):
-    """A dataset that can be obtained from a URI.
-    :param uri: The dataset can be obtained from a URI. E.g.
-        - "https://mywebsite.com/mydata.jsonl"
-        - "lsfs://mydata.jsonl"
-        - "data:csv;base64,{base64_content}"
-    """
-
-    type: Literal["uri"] = "uri"
-    uri: str
-
-
-@json_schema_type
-class RowsDataSource(BaseModel):
-    """A dataset stored in rows.
-    :param rows: The dataset is stored in rows. E.g.
-        - [
-            {"messages": [{"role": "user", "content": "Hello, world!"}, {"role": "assistant", "content": "Hello, world!"}]}
-        ]
-    """
-
-    type: Literal["rows"] = "rows"
-    rows: list[dict[str, Any]]
-
-
-DataSource = Annotated[
-    URIDataSource | RowsDataSource,
-    Field(discriminator="type"),
-]
-register_schema(DataSource, name="DataSource")
-
-
-class CommonDatasetFields(BaseModel):
-    """
-    Common fields for a dataset.
-
-    :param purpose: Purpose of the dataset indicating its intended use
-    :param source: Data source configuration for the dataset
-    :param metadata: Additional metadata for the dataset
-    """
-
-    purpose: DatasetPurpose
-    source: DataSource
-    metadata: dict[str, Any] = Field(
-        default_factory=dict,
-        description="Any additional metadata for this dataset",
-    )
-
-
-@json_schema_type
-class Dataset(CommonDatasetFields, Resource):
-    """Dataset resource for storing and accessing training or evaluation data.
-
-    :param type: Type of resource, always 'dataset' for datasets
-    """
-
-    type: Literal[ResourceType.dataset] = ResourceType.dataset
-
-    @property
-    def dataset_id(self) -> str:
-        return self.identifier
-
-    @property
-    def provider_dataset_id(self) -> str | None:
-        return self.provider_resource_id
-
-
-class DatasetInput(CommonDatasetFields, BaseModel):
-    """Input parameters for dataset operations.
-
-    :param dataset_id: Unique identifier for the dataset
-    """
-
-    dataset_id: str
-
-
-class ListDatasetsResponse(BaseModel):
-    """Response from listing datasets.
-
-    :param data: List of datasets
-    """
-
-    data: list[Dataset]
-
-
-class Datasets(Protocol):
-    @webmethod(route="/datasets", method="POST", level=LLAMA_STACK_API_V1BETA)
-    async def register_dataset(
-        self,
-        purpose: DatasetPurpose,
-        source: DataSource,
-        metadata: dict[str, Any] | None = None,
-        dataset_id: str | None = None,
-    ) -> Dataset:
-        """
-        Register a new dataset.
-
-        :param purpose: The purpose of the dataset.
-        One of:
-            - "post-training/messages": The dataset contains a messages column with list of messages for post-training.
-                {
-                    "messages": [
-                        {"role": "user", "content": "Hello, world!"},
-                        {"role": "assistant", "content": "Hello, world!"},
-                    ]
-                }
-            - "eval/question-answer": The dataset contains a question column and an answer column for evaluation.
-                {
-                    "question": "What is the capital of France?",
-                    "answer": "Paris"
-                }
-            - "eval/messages-answer": The dataset contains a messages column with list of messages and an answer column for evaluation.
-                {
-                    "messages": [
-                        {"role": "user", "content": "Hello, my name is John Doe."},
-                        {"role": "assistant", "content": "Hello, John Doe. How can I help you today?"},
-                        {"role": "user", "content": "What's my name?"},
-                    ],
-                    "answer": "John Doe"
-                }
-        :param source: The data source of the dataset. Ensure that the data source schema is compatible with the purpose of the dataset. Examples:
-           - {
-               "type": "uri",
-               "uri": "https://mywebsite.com/mydata.jsonl"
-           }
-           - {
-               "type": "uri",
-               "uri": "lsfs://mydata.jsonl"
-           }
-           - {
-               "type": "uri",
-               "uri": "data:csv;base64,{base64_content}"
-           }
-           - {
-               "type": "uri",
-               "uri": "huggingface://llamastack/simpleqa?split=train"
-           }
-           - {
-               "type": "rows",
-               "rows": [
-                   {
-                       "messages": [
-                           {"role": "user", "content": "Hello, world!"},
-                           {"role": "assistant", "content": "Hello, world!"},
-                       ]
-                   }
-               ]
-           }
-        :param metadata: The metadata for the dataset.
-           - E.g. {"description": "My dataset"}.
-        :param dataset_id: The ID of the dataset. If not provided, an ID will be generated.
-        :returns: A Dataset.
-        """
-        ...
-
-    @webmethod(route="/datasets/{dataset_id:path}", method="GET", level=LLAMA_STACK_API_V1BETA)
-    async def get_dataset(
-        self,
-        dataset_id: str,
-    ) -> Dataset:
-        """Get a dataset by its ID.
-
-        :param dataset_id: The ID of the dataset to get.
-        :returns: A Dataset.
-        """
-        ...
-
-    @webmethod(route="/datasets", method="GET", level=LLAMA_STACK_API_V1BETA)
-    async def list_datasets(self) -> ListDatasetsResponse:
-        """List all datasets.
-
-        :returns: A ListDatasetsResponse.
-        """
-        ...
-
-    @webmethod(route="/datasets/{dataset_id:path}", method="DELETE", level=LLAMA_STACK_API_V1BETA)
-    async def unregister_dataset(
-        self,
-        dataset_id: str,
-    ) -> None:
-        """Unregister a dataset by its ID.
-
-        :param dataset_id: The ID of the dataset to unregister.
-        """
-        ...
--- a/src/llama_stack/apis/datatypes.py
+++ b/src/llama_stack/apis/datatypes.py
@ -1,158 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from enum import Enum, EnumMeta
-
-from pydantic import BaseModel, Field
-
-from llama_stack.schema_utils import json_schema_type
-
-
-class DynamicApiMeta(EnumMeta):
-    """Enum metaclass that allows members to be registered at runtime.
-
-    Members declared in the class body behave like ordinary enum members.
-    Further members can be registered with :meth:`add`; afterwards they can be
-    looked up by value through ``cls(value)`` and appear when iterating the class.
-    """
-
-    def __new__(metacls, name, bases, namespace, **kwargs):
-        """Create the enum class and attach the bookkeeping dictionaries.
-
-        :param name: Name of the enum class being created.
-        :param bases: Base classes of the enum class.
-        :param namespace: Class body namespace.
-        :param kwargs: Extra class keywords, forwarded unchanged to ``EnumMeta``.
-        :returns: The newly created enum class.
-        """
-        # Store the original enum values (every public name of the class body)
-        original_values = {k: v for k, v in namespace.items() if not k.startswith("_")}
-
-        # Create the enum class
-        enum_cls = super().__new__(metacls, name, bases, namespace, **kwargs)
-
-        # Store the original values for reference
-        enum_cls._original_values = original_values
-        # Initialize _dynamic_values (maps value -> member registered through add())
-        enum_cls._dynamic_values = {}
-
-        return enum_cls
-
-    def __call__(cls, value):
-        """Look up a member by value, including members registered with :meth:`add`.
-
-        :param value: The value to look up.
-        :returns: The matching static or dynamically registered member.
-        :raises ValueError: If no member matches ``value``.
-        """
-        try:
-            return super().__call__(value)
-        except ValueError as e:
-            # Dynamic members are keyed by string; guarding on str keeps a
-            # non-string (or unhashable) lookup on the ValueError path instead of
-            # failing with AttributeError/TypeError below.
-            if isinstance(value, str):
-                # If this value was already dynamically added, return it
-                if value in cls._dynamic_values:
-                    return cls._dynamic_values[value]
-
-                # Derive the member name from the value
-                member_name = value.lower().replace("-", "_")
-
-                # If this member name already exists in the enum, return the existing member
-                if member_name in cls._member_map_:
-                    return cls._member_map_[member_name]
-
-            # Instead of creating a new member, raise ValueError to force users to use Api.add() to
-            # register new APIs explicitly
-            raise ValueError(f"API '{value}' does not exist. Use Api.add() to register new APIs.") from e
-
-    def __iter__(cls):
-        """Iterate over static and dynamic members, yielding each member once."""
-        # add() appends dynamic members to _member_names_/_member_map_, which is
-        # what EnumMeta.__iter__ walks, so they are normally already covered here.
-        seen = set()
-        for member in super().__iter__():
-            seen.add(member._name_)
-            yield member
-        # Fallback for dynamic members that are not part of _member_names_.
-        for member in getattr(cls, "_dynamic_values", {}).values():
-            if member._name_ not in seen:
-                seen.add(member._name_)
-                yield member
-
-    def add(cls, value):
-        """
-        Add a new API to the enum.
-        Used to register external APIs.
-
-        :param value: Value of the new member; its name is the lower-cased value
-            with ``-`` replaced by ``_``.
-        :returns: The existing member with that name, or the newly created member.
-        """
-        member_name = value.lower().replace("-", "_")
-
-        # If this member name already exists in the enum, return it
-        if member_name in cls._member_map_:
-            return cls._member_map_[member_name]
-
-        # Create a new enum member
-        member = object.__new__(cls)
-        member._name_ = member_name
-        member._value_ = value
-
-        # Add it to the enum class
-        cls._member_map_[member_name] = member
-        cls._member_names_.append(member_name)
-        # NOTE(review): this rebinds the enum's member type on every add(); kept
-        # as-is because the reason is not visible here - confirm it is still needed.
-        cls._member_type_ = str
-
-        # Store it in our dynamic values
-        cls._dynamic_values[value] = member
-
-        return member
-
-
-@json_schema_type
-class Api(Enum, metaclass=DynamicApiMeta):
-    """Enumeration of all available APIs in the Llama Stack system.
-    :cvar providers: Provider management and configuration
-    :cvar inference: Text generation, chat completions, and embeddings
-    :cvar safety: Content moderation and safety shields
-    :cvar agents: Agent orchestration and execution
-    :cvar batches: Batch processing for asynchronous API requests
-    :cvar vector_io: Vector database operations and queries
-    :cvar datasetio: Dataset input/output operations
-    :cvar scoring: Model output evaluation and scoring
-    :cvar eval: Model evaluation and benchmarking framework
-    :cvar post_training: Fine-tuning and model training
-    :cvar tool_runtime: Tool execution and management
-    :cvar models: Model metadata and management
-    :cvar shields: Safety shield implementations
-    :cvar vector_stores: Vector stores (only used for the routing table)
-    :cvar datasets: Dataset creation and management
-    :cvar scoring_functions: Scoring function definitions
-    :cvar benchmarks: Benchmark suite management
-    :cvar tool_groups: Tool group organization
-    :cvar files: File storage and management
-    :cvar prompts: Prompt versions and management
-    :cvar conversations: Conversation management operations
-    :cvar inspect: Built-in system inspection and introspection
-    """
-
-    providers = "providers"
-    inference = "inference"
-    safety = "safety"
-    agents = "agents"
-    batches = "batches"
-    vector_io = "vector_io"
-    datasetio = "datasetio"
-    scoring = "scoring"
-    eval = "eval"
-    post_training = "post_training"
-    tool_runtime = "tool_runtime"
-
-    models = "models"
-    shields = "shields"
-    vector_stores = "vector_stores"  # only used for routing table
-    datasets = "datasets"
-    scoring_functions = "scoring_functions"
-    benchmarks = "benchmarks"
-    tool_groups = "tool_groups"
-    files = "files"
-    prompts = "prompts"
-    conversations = "conversations"
-
-    # built-in API
-    inspect = "inspect"
-
-
-@json_schema_type
-class Error(BaseModel):
-    """
-    Error response from the API. Roughly follows RFC 7807.
-
-    :param status: HTTP status code
-    :param title: Error title, a short summary of the error which is invariant for an error type
-    :param detail: Error detail, a longer human-readable description of the error
-    :param instance: (Optional) A URL which can be used to retrieve more information about the specific occurrence of the error
-    """
-
-    status: int
-    title: str
-    detail: str
-    instance: str | None = None
-
-
-class ExternalApiSpec(BaseModel):
-    """Specification for an external API implementation."""
-
-    module: str = Field(..., description="Python module containing the API implementation")
-    name: str = Field(..., description="Name of the API")
-    pip_packages: list[str] = Field(default=[], description="List of pip packages to install the API")
-    protocol: str = Field(..., description="Name of the protocol class for the API")
--- a/src/llama_stack/apis/eval/init.py
+++ b/src/llama_stack/apis/eval/init.py
@ -1,7 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from .eval import *
--- a/src/llama_stack/apis/eval/eval.py
+++ b/src/llama_stack/apis/eval/eval.py
@ -1,150 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from typing import Annotated, Any, Literal, Protocol
-
-from pydantic import BaseModel, Field
-
-from llama_stack.apis.agents import AgentConfig
-from llama_stack.apis.common.job_types import Job
-from llama_stack.apis.inference import SamplingParams, SystemMessage
-from llama_stack.apis.scoring import ScoringResult
-from llama_stack.apis.scoring_functions import ScoringFnParams
-from llama_stack.apis.version import LLAMA_STACK_API_V1ALPHA
-from llama_stack.schema_utils import json_schema_type, register_schema, webmethod
-
-
-@json_schema_type
-class ModelCandidate(BaseModel):
-    """A model candidate for evaluation.
-
-    :param model: The model ID to evaluate.
-    :param sampling_params: The sampling parameters for the model.
-    :param system_message: (Optional) The system message providing instructions or context to the model.
-    """
-
-    type: Literal["model"] = "model"
-    model: str
-    sampling_params: SamplingParams
-    system_message: SystemMessage | None = None
-
-
-@json_schema_type
-class AgentCandidate(BaseModel):
-    """An agent candidate for evaluation.
-
-    :param config: The configuration for the agent candidate.
-    """
-
-    type: Literal["agent"] = "agent"
-    config: AgentConfig
-
-
-EvalCandidate = Annotated[ModelCandidate | AgentCandidate, Field(discriminator="type")]
-register_schema(EvalCandidate, name="EvalCandidate")
-
-
-@json_schema_type
-class BenchmarkConfig(BaseModel):
-    """A benchmark configuration for evaluation.
-
-    :param eval_candidate: The candidate to evaluate.
-    :param scoring_params: Map between scoring function id and parameters for each scoring function you want to run
-    :param num_examples: (Optional) The number of examples to evaluate. If not provided, all examples in the dataset will be evaluated
-    """
-
-    eval_candidate: EvalCandidate
-    scoring_params: dict[str, ScoringFnParams] = Field(
-        description="Map between scoring function id and parameters for each scoring function you want to run",
-        default_factory=dict,
-    )
-    num_examples: int | None = Field(
-        description="Number of examples to evaluate (useful for testing), if not provided, all examples in the dataset will be evaluated",
-        default=None,
-    )
-    # we could optionally add any specific dataset config here
-
-
-@json_schema_type
-class EvaluateResponse(BaseModel):
-    """The response from an evaluation.
-
-    :param generations: The generations from the evaluation.
-    :param scores: The scores from the evaluation.
-    """
-
-    generations: list[dict[str, Any]]
-    # each key in the dict is a scoring function name
-    scores: dict[str, ScoringResult]
-
-
-class Eval(Protocol):
-    """Evaluations
-
-    Llama Stack Evaluation API for running evaluations on model and agent candidates."""
-
-    @webmethod(route="/eval/benchmarks/{benchmark_id}/jobs", method="POST", level=LLAMA_STACK_API_V1ALPHA)
-    async def run_eval(
-        self,
-        benchmark_id: str,
-        benchmark_config: BenchmarkConfig,
-    ) -> Job:
-        """Run an evaluation on a benchmark.
-
-        :param benchmark_id: The ID of the benchmark to run the evaluation on.
-        :param benchmark_config: The configuration for the benchmark.
-        :returns: The job that was created to run the evaluation.
-        """
-        ...
-
-    @webmethod(route="/eval/benchmarks/{benchmark_id}/evaluations", method="POST", level=LLAMA_STACK_API_V1ALPHA)
-    async def evaluate_rows(
-        self,
-        benchmark_id: str,
-        input_rows: list[dict[str, Any]],
-        scoring_functions: list[str],
-        benchmark_config: BenchmarkConfig,
-    ) -> EvaluateResponse:
-        """Evaluate a list of rows on a benchmark.
-
-        :param benchmark_id: The ID of the benchmark to run the evaluation on.
-        :param input_rows: The rows to evaluate.
-        :param scoring_functions: The scoring functions to use for the evaluation.
-        :param benchmark_config: The configuration for the benchmark.
-        :returns: EvaluateResponse object containing generations and scores.
-        """
-        ...
-
-    @webmethod(route="/eval/benchmarks/{benchmark_id}/jobs/{job_id}", method="GET", level=LLAMA_STACK_API_V1ALPHA)
-    async def job_status(self, benchmark_id: str, job_id: str) -> Job:
-        """Get the status of a job.
-
-        :param benchmark_id: The ID of the benchmark to run the evaluation on.
-        :param job_id: The ID of the job to get the status of.
-        :returns: The status of the evaluation job.
-        """
-        ...
-
-    @webmethod(route="/eval/benchmarks/{benchmark_id}/jobs/{job_id}", method="DELETE", level=LLAMA_STACK_API_V1ALPHA)
-    async def job_cancel(self, benchmark_id: str, job_id: str) -> None:
-        """Cancel a job.
-
-        :param benchmark_id: The ID of the benchmark to run the evaluation on.
-        :param job_id: The ID of the job to cancel.
-        """
-        ...
-
-    @webmethod(
-        route="/eval/benchmarks/{benchmark_id}/jobs/{job_id}/result", method="GET", level=LLAMA_STACK_API_V1ALPHA
-    )
-    async def job_result(self, benchmark_id: str, job_id: str) -> EvaluateResponse:
-        """Get the result of a job.
-
-        :param benchmark_id: The ID of the benchmark to run the evaluation on.
-        :param job_id: The ID of the job to get the result of.
-        :returns: The result of the job.
-        """
-        ...
--- a/src/llama_stack/apis/files/init.py
+++ b/src/llama_stack/apis/files/init.py
@ -1,7 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from .files import *
--- a/src/llama_stack/apis/files/files.py
+++ b/src/llama_stack/apis/files/files.py
@ -1,194 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from enum import StrEnum
-from typing import Annotated, ClassVar, Literal, Protocol, runtime_checkable
-
-from fastapi import File, Form, Response, UploadFile
-from pydantic import BaseModel, Field
-
-from llama_stack.apis.common.responses import Order
-from llama_stack.apis.version import LLAMA_STACK_API_V1
-from llama_stack.core.telemetry.trace_protocol import trace_protocol
-from llama_stack.schema_utils import json_schema_type, webmethod
-
-
-# OpenAI Files API Models
-class OpenAIFilePurpose(StrEnum):
-    """
-    Valid purpose values for OpenAI Files API.
-    """
-
-    ASSISTANTS = "assistants"
-    BATCH = "batch"
-    # TODO: Add other purposes as needed
-
-
-@json_schema_type
-class OpenAIFileObject(BaseModel):
-    """
-    OpenAI File object as defined in the OpenAI Files API.
-
-    :param object: The object type, which is always "file"
-    :param id: The file identifier, which can be referenced in the API endpoints
-    :param bytes: The size of the file, in bytes
-    :param created_at: The Unix timestamp (in seconds) for when the file was created
-    :param expires_at: The Unix timestamp (in seconds) for when the file expires
-    :param filename: The name of the file
-    :param purpose: The intended purpose of the file
-    """
-
-    object: Literal["file"] = "file"
-    id: str
-    bytes: int
-    created_at: int
-    expires_at: int
-    filename: str
-    purpose: OpenAIFilePurpose
-
-
-@json_schema_type
-class ExpiresAfter(BaseModel):
-    """
-    Control expiration of uploaded files.
-
-    Params:
-     - anchor, must be "created_at"
-     - seconds, must be int between 3600 and 2592000 (1 hour to 30 days)
-    """
-
-    MIN: ClassVar[int] = 3600  # 1 hour
-    MAX: ClassVar[int] = 2592000  # 30 days
-
-    anchor: Literal["created_at"]
-    # The bounds reference the class constants so the limits are defined only once.
-    seconds: int = Field(..., ge=MIN, le=MAX)
-
-
-@json_schema_type
-class ListOpenAIFileResponse(BaseModel):
-    """
-    Response for listing files in OpenAI Files API.
-
-    :param data: List of file objects
-    :param has_more: Whether there are more files available beyond this page
-    :param first_id: ID of the first file in the list for pagination
-    :param last_id: ID of the last file in the list for pagination
-    :param object: The object type, which is always "list"
-    """
-
-    data: list[OpenAIFileObject]
-    has_more: bool
-    first_id: str
-    last_id: str
-    object: Literal["list"] = "list"
-
-
-@json_schema_type
-class OpenAIFileDeleteResponse(BaseModel):
-    """
-    Response for deleting a file in OpenAI Files API.
-
-    :param id: The file identifier that was deleted
-    :param object: The object type, which is always "file"
-    :param deleted: Whether the file was successfully deleted
-    """
-
-    id: str
-    object: Literal["file"] = "file"
-    deleted: bool
-
-
-@runtime_checkable
-@trace_protocol
-class Files(Protocol):
-    """Files
-
-    This API is used to upload documents that can be used with other Llama Stack APIs.
-    """
-
-    # OpenAI Files API Endpoints
-    @webmethod(route="/files", method="POST", level=LLAMA_STACK_API_V1)
-    async def openai_upload_file(
-        self,
-        file: Annotated[UploadFile, File()],
-        purpose: Annotated[OpenAIFilePurpose, Form()],
-        expires_after: Annotated[ExpiresAfter | None, Form()] = None,
-    ) -> OpenAIFileObject:
-        """Upload file.
-
-        Upload a file that can be used across various endpoints.
-
-        The file upload should be a multipart form request with:
-        - file: The File object (not file name) to be uploaded.
-        - purpose: The intended purpose of the uploaded file.
-        - expires_after: Optional form values describing expiration for the file.
-
-        :param file: The uploaded file object containing content and metadata (filename, content_type, etc.).
-        :param purpose: The intended purpose of the uploaded file (e.g., "assistants", "batch").
-        :param expires_after: Optional form values describing expiration for the file.
-        :returns: An OpenAIFileObject representing the uploaded file.
-        """
-        ...
-
-    @webmethod(route="/files", method="GET", level=LLAMA_STACK_API_V1)
-    async def openai_list_files(
-        self,
-        after: str | None = None,
-        limit: int | None = 10000,
-        order: Order | None = Order.desc,
-        purpose: OpenAIFilePurpose | None = None,
-    ) -> ListOpenAIFileResponse:
-        """List files.
-
-        Returns a list of files that belong to the user's organization.
-
-        :param after: A cursor for use in pagination. `after` is an object ID that defines your place in the list. For instance, if you make a list request and receive 100 objects, ending with obj_foo, your subsequent call can include after=obj_foo in order to fetch the next page of the list.
-        :param limit: A limit on the number of objects to be returned. Limit can range between 1 and 10,000, and the default is 10,000.
-        :param order: Sort order by the `created_at` timestamp of the objects. `asc` for ascending order and `desc` for descending order.
-        :param purpose: Only return files with the given purpose.
-        :returns: A ListOpenAIFileResponse containing the list of files.
-        """
-        ...
-
-    @webmethod(route="/files/{file_id}", method="GET", level=LLAMA_STACK_API_V1)
-    async def openai_retrieve_file(
-        self,
-        file_id: str,
-    ) -> OpenAIFileObject:
-        """Retrieve file.
-
-        Returns information about a specific file.
-
-        :param file_id: The ID of the file to use for this request.
-        :returns: An OpenAIFileObject containing file information.
-        """
-        ...
-
-    @webmethod(route="/files/{file_id}", method="DELETE", level=LLAMA_STACK_API_V1)
-    async def openai_delete_file(
-        self,
-        file_id: str,
-    ) -> OpenAIFileDeleteResponse:
-        """Delete file.
-
-        :param file_id: The ID of the file to use for this request.
-        :returns: An OpenAIFileDeleteResponse indicating successful deletion.
-        """
-        ...
-
-    @webmethod(route="/files/{file_id}/content", method="GET", level=LLAMA_STACK_API_V1)
-    async def openai_retrieve_file_content(
-        self,
-        file_id: str,
-    ) -> Response:
-        """Retrieve file content.
-
-        Returns the contents of the specified file.
-
-        :param file_id: The ID of the file to use for this request.
-        :returns: The raw file content as a binary response.
-        """
-        ...
--- a/src/llama_stack/apis/inference/init.py
+++ b/src/llama_stack/apis/inference/init.py
@ -1,7 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from .inference import *
--- a/src/llama_stack/apis/inference/event_logger.py
+++ b/src/llama_stack/apis/inference/event_logger.py
@ -1,43 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from termcolor import cprint
-
-from llama_stack.apis.inference import (
-    ChatCompletionResponseEventType,
-    ChatCompletionResponseStreamChunk,
-)
-
-
-class LogEvent:
-    def __init__(
-        self,
-        content: str = "",
-        end: str = "\n",
-        color="white",
-    ):
-        self.content = content
-        self.color = color
-        self.end = "\n" if end is None else end
-
-    def print(self, flush=True):
-        cprint(f"{self.content}", color=self.color, end=self.end, flush=flush)
-
-
-class EventLogger:
-    async def log(self, event_generator):
-        async for chunk in event_generator:
-            if isinstance(chunk, ChatCompletionResponseStreamChunk):
-                event = chunk.event
-                if event.event_type == ChatCompletionResponseEventType.start:
-                    yield LogEvent("Assistant> ", color="cyan", end="")
-                elif event.event_type == ChatCompletionResponseEventType.progress:
-                    yield LogEvent(event.delta, color="yellow", end="")
-                elif event.event_type == ChatCompletionResponseEventType.complete:
-                    yield LogEvent("")
-            else:
-                yield LogEvent("Assistant> ", color="cyan", end="")
-                yield LogEvent(chunk.completion_message.content, color="yellow")
--- a/src/llama_stack/apis/inference/inference.py
+++ b/src/llama_stack/apis/inference/inference.py
--- a/src/llama_stack/apis/inspect/init.py
+++ b/src/llama_stack/apis/inspect/init.py
@ -1,7 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from .inspect import *
--- a/src/llama_stack/apis/inspect/inspect.py
+++ b/src/llama_stack/apis/inspect/inspect.py
@ -1,102 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from typing import Literal, Protocol, runtime_checkable
-
-from pydantic import BaseModel
-
-from llama_stack.apis.version import (
-    LLAMA_STACK_API_V1,
-)
-from llama_stack.providers.datatypes import HealthStatus
-from llama_stack.schema_utils import json_schema_type, webmethod
-
-# Valid values for the route filter parameter.
-# Actual API levels: v1, v1alpha, v1beta (filters by level, excludes deprecated)
-# Special filter value: "deprecated" (shows deprecated routes regardless of level)
-ApiFilter = Literal["v1", "v1alpha", "v1beta", "deprecated"]
-
-
-@json_schema_type
-class RouteInfo(BaseModel):
-    """Information about an API route including its path, method, and implementing providers.
-
-    :param route: The API endpoint path
-    :param method: HTTP method for the route
-    :param provider_types: List of provider types that implement this route
-    """
-
-    route: str
-    method: str
-    provider_types: list[str]
-
-
-@json_schema_type
-class HealthInfo(BaseModel):
-    """Health status information for the service.
-
-    :param status: Current health status of the service
-    """
-
-    status: HealthStatus
-
-
-@json_schema_type
-class VersionInfo(BaseModel):
-    """Version information for the service.
-
-    :param version: Version number of the service
-    """
-
-    version: str
-
-
-class ListRoutesResponse(BaseModel):
-    """Response containing a list of all available API routes.
-
-    :param data: List of available route information objects
-    """
-
-    data: list[RouteInfo]
-
-
-@runtime_checkable
-class Inspect(Protocol):
-    """Inspect
-
-    APIs for inspecting the Llama Stack service, including health status, available API routes with methods and implementing providers.
-    """
-
-    @webmethod(route="/inspect/routes", method="GET", level=LLAMA_STACK_API_V1)
-    async def list_routes(self, api_filter: ApiFilter | None = None) -> ListRoutesResponse:
-        """List routes.
-
-        List all available API routes with their methods and implementing providers.
-
-        :param api_filter: Optional filter to control which routes are returned. Can be an API level ('v1', 'v1alpha', 'v1beta') to show non-deprecated routes at that level, or 'deprecated' to show deprecated routes across all levels. If not specified, returns only non-deprecated v1 routes.
-        :returns: Response containing information about all available routes.
-        """
-        ...
-
-    @webmethod(route="/health", method="GET", level=LLAMA_STACK_API_V1, require_authentication=False)
-    async def health(self) -> HealthInfo:
-        """Get health status.
-
-        Get the current health status of the service.
-
-        :returns: Health information indicating if the service is operational.
-        """
-        ...
-
-    @webmethod(route="/version", method="GET", level=LLAMA_STACK_API_V1, require_authentication=False)
-    async def version(self) -> VersionInfo:
-        """Get version.
-
-        Get the version of the service.
-
-        :returns: Version information containing the service version number.
-        """
-        ...
--- a/src/llama_stack/apis/models/init.py
+++ b/src/llama_stack/apis/models/init.py
@ -1,7 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from .models import *
--- a/src/llama_stack/apis/models/models.py
+++ b/src/llama_stack/apis/models/models.py
@ -1,172 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from enum import StrEnum
-from typing import Any, Literal, Protocol, runtime_checkable
-
-from pydantic import BaseModel, ConfigDict, Field, field_validator
-
-from llama_stack.apis.resource import Resource, ResourceType
-from llama_stack.apis.version import LLAMA_STACK_API_V1
-from llama_stack.core.telemetry.trace_protocol import trace_protocol
-from llama_stack.schema_utils import json_schema_type, webmethod
-
-
-class CommonModelFields(BaseModel):
-    metadata: dict[str, Any] = Field(
-        default_factory=dict,
-        description="Any additional metadata for this model",
-    )
-
-
-@json_schema_type
-class ModelType(StrEnum):
-    """Enumeration of supported model types in Llama Stack.
-    :cvar llm: Large language model for text generation and completion
-    :cvar embedding: Embedding model for converting text to vector representations
-    :cvar rerank: Reranking model for reordering documents based on their relevance to a query
-    """
-
-    llm = "llm"
-    embedding = "embedding"
-    rerank = "rerank"
-
-
-@json_schema_type
-class Model(CommonModelFields, Resource):
-    """A model resource representing an AI model registered in Llama Stack.
-
-    :param type: The resource type, always 'model' for model resources
-    :param model_type: The type of model (LLM or embedding model)
-    :param metadata: Any additional metadata for this model
-    :param identifier: Unique identifier for this resource in llama stack
-    :param provider_resource_id: Unique identifier for this resource in the provider
-    :param provider_id: ID of the provider that owns this resource
-    """
-
-    type: Literal[ResourceType.model] = ResourceType.model
-
-    @property
-    def model_id(self) -> str:
-        return self.identifier
-
-    @property
-    def provider_model_id(self) -> str:
-        assert self.provider_resource_id is not None, "Provider resource ID must be set"
-        return self.provider_resource_id
-
-    model_config = ConfigDict(protected_namespaces=())
-
-    model_type: ModelType = Field(default=ModelType.llm)
-
-    @field_validator("provider_resource_id")
-    @classmethod
-    def validate_provider_resource_id(cls, v):
-        if v is None:
-            raise ValueError("provider_resource_id cannot be None")
-        return v
-
-
-class ModelInput(CommonModelFields):
-    model_id: str
-    provider_id: str | None = None
-    provider_model_id: str | None = None
-    model_type: ModelType | None = ModelType.llm
-    model_config = ConfigDict(protected_namespaces=())
-
-
-class ListModelsResponse(BaseModel):
-    data: list[Model]
-
-
-@json_schema_type
-class OpenAIModel(BaseModel):
-    """A model from OpenAI.
-
-    :id: The ID of the model
-    :object: The object type, which will be "model"
-    :created: The Unix timestamp in seconds when the model was created
-    :owned_by: The owner of the model
-    :custom_metadata: Llama Stack-specific metadata including model_type, provider info, and additional metadata
-    """
-
-    id: str
-    object: Literal["model"] = "model"
-    created: int
-    owned_by: str
-    custom_metadata: dict[str, Any] | None = None
-
-
-class OpenAIListModelsResponse(BaseModel):
-    data: list[OpenAIModel]
-
-
-@runtime_checkable
-@trace_protocol
-class Models(Protocol):
-    async def list_models(self) -> ListModelsResponse:
-        """List all models.
-
-        :returns: A ListModelsResponse.
-        """
-        ...
-
-    @webmethod(route="/models", method="GET", level=LLAMA_STACK_API_V1)
-    async def openai_list_models(self) -> OpenAIListModelsResponse:
-        """List models using the OpenAI API.
-
-        :returns: A OpenAIListModelsResponse.
-        """
-        ...
-
-    @webmethod(route="/models/{model_id:path}", method="GET", level=LLAMA_STACK_API_V1)
-    async def get_model(
-        self,
-        model_id: str,
-    ) -> Model:
-        """Get model.
-
-        Get a model by its identifier.
-
-        :param model_id: The identifier of the model to get.
-        :returns: A Model.
-        """
-        ...
-
-    @webmethod(route="/models", method="POST", level=LLAMA_STACK_API_V1)
-    async def register_model(
-        self,
-        model_id: str,
-        provider_model_id: str | None = None,
-        provider_id: str | None = None,
-        metadata: dict[str, Any] | None = None,
-        model_type: ModelType | None = None,
-    ) -> Model:
-        """Register model.
-
-        Register a model.
-
-        :param model_id: The identifier of the model to register.
-        :param provider_model_id: The identifier of the model in the provider.
-        :param provider_id: The identifier of the provider.
-        :param metadata: Any additional metadata for this model.
-        :param model_type: The type of model to register.
-        :returns: A Model.
-        """
-        ...
-
-    @webmethod(route="/models/{model_id:path}", method="DELETE", level=LLAMA_STACK_API_V1)
-    async def unregister_model(
-        self,
-        model_id: str,
-    ) -> None:
-        """Unregister model.
-
-        Unregister a model.
-
-        :param model_id: The identifier of the model to unregister.
-        """
-        ...
--- a/src/llama_stack/apis/post_training/init.py
+++ b/src/llama_stack/apis/post_training/init.py
@ -1,7 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from .post_training import *
--- a/src/llama_stack/apis/post_training/post_training.py
+++ b/src/llama_stack/apis/post_training/post_training.py
@ -1,368 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from datetime import datetime
-from enum import Enum
-from typing import Annotated, Any, Literal, Protocol
-
-from pydantic import BaseModel, Field
-
-from llama_stack.apis.common.content_types import URL
-from llama_stack.apis.common.job_types import JobStatus
-from llama_stack.apis.common.training_types import Checkpoint
-from llama_stack.apis.version import LLAMA_STACK_API_V1ALPHA
-from llama_stack.schema_utils import json_schema_type, register_schema, webmethod
-
-
-@json_schema_type
-class OptimizerType(Enum):
-    """Available optimizer algorithms for training.
-    :cvar adam: Adaptive Moment Estimation optimizer
-    :cvar adamw: AdamW optimizer with weight decay
-    :cvar sgd: Stochastic Gradient Descent optimizer
-    """
-
-    adam = "adam"
-    adamw = "adamw"
-    sgd = "sgd"
-
-
-@json_schema_type
-class DatasetFormat(Enum):
-    """Format of the training dataset.
-    :cvar instruct: Instruction-following format with prompt and completion
-    :cvar dialog: Multi-turn conversation format with messages
-    """
-
-    instruct = "instruct"
-    dialog = "dialog"
-
-
-@json_schema_type
-class DataConfig(BaseModel):
-    """Configuration for training data and data loading.
-
-    :param dataset_id: Unique identifier for the training dataset
-    :param batch_size: Number of samples per training batch
-    :param shuffle: Whether to shuffle the dataset during training
-    :param data_format: Format of the dataset (instruct or dialog)
-    :param validation_dataset_id: (Optional) Unique identifier for the validation dataset
-    :param packed: (Optional) Whether to pack multiple samples into a single sequence for efficiency
-    :param train_on_input: (Optional) Whether to compute loss on input tokens as well as output tokens
-    """
-
-    dataset_id: str
-    batch_size: int
-    shuffle: bool
-    data_format: DatasetFormat
-    validation_dataset_id: str | None = None
-    packed: bool | None = False
-    train_on_input: bool | None = False
-
-
-@json_schema_type
-class OptimizerConfig(BaseModel):
-    """Configuration parameters for the optimization algorithm.
-
-    :param optimizer_type: Type of optimizer to use (adam, adamw, or sgd)
-    :param lr: Learning rate for the optimizer
-    :param weight_decay: Weight decay coefficient for regularization
-    :param num_warmup_steps: Number of steps for learning rate warmup
-    """
-
-    optimizer_type: OptimizerType
-    lr: float
-    weight_decay: float
-    num_warmup_steps: int
-
-
-@json_schema_type
-class EfficiencyConfig(BaseModel):
-    """Configuration for memory and compute efficiency optimizations.
-
-    :param enable_activation_checkpointing: (Optional) Whether to use activation checkpointing to reduce memory usage
-    :param enable_activation_offloading: (Optional) Whether to offload activations to CPU to save GPU memory
-    :param memory_efficient_fsdp_wrap: (Optional) Whether to use memory-efficient FSDP wrapping
-    :param fsdp_cpu_offload: (Optional) Whether to offload FSDP parameters to CPU
-    """
-
-    enable_activation_checkpointing: bool | None = False
-    enable_activation_offloading: bool | None = False
-    memory_efficient_fsdp_wrap: bool | None = False
-    fsdp_cpu_offload: bool | None = False
-
-
-@json_schema_type
-class TrainingConfig(BaseModel):
-    """Comprehensive configuration for the training process.
-
-    :param n_epochs: Number of training epochs to run
-    :param max_steps_per_epoch: Maximum number of steps to run per epoch
-    :param gradient_accumulation_steps: Number of steps to accumulate gradients before updating
-    :param max_validation_steps: (Optional) Maximum number of validation steps per epoch
-    :param data_config: (Optional) Configuration for data loading and formatting
-    :param optimizer_config: (Optional) Configuration for the optimization algorithm
-    :param efficiency_config: (Optional) Configuration for memory and compute optimizations
-    :param dtype: (Optional) Data type for model parameters (bf16, fp16, fp32)
-    """
-
-    n_epochs: int
-    max_steps_per_epoch: int = 1
-    gradient_accumulation_steps: int = 1
-    max_validation_steps: int | None = 1
-    data_config: DataConfig | None = None
-    optimizer_config: OptimizerConfig | None = None
-    efficiency_config: EfficiencyConfig | None = None
-    dtype: str | None = "bf16"
-
-
-@json_schema_type
-class LoraFinetuningConfig(BaseModel):
-    """Configuration for Low-Rank Adaptation (LoRA) fine-tuning.
-
-    :param type: Algorithm type identifier, always "LoRA"
-    :param lora_attn_modules: List of attention module names to apply LoRA to
-    :param apply_lora_to_mlp: Whether to apply LoRA to MLP layers
-    :param apply_lora_to_output: Whether to apply LoRA to output projection layers
-    :param rank: Rank of the LoRA adaptation (lower rank = fewer parameters)
-    :param alpha: LoRA scaling parameter that controls adaptation strength
-    :param use_dora: (Optional) Whether to use DoRA (Weight-Decomposed Low-Rank Adaptation)
-    :param quantize_base: (Optional) Whether to quantize the base model weights
-    """
-
-    type: Literal["LoRA"] = "LoRA"
-    lora_attn_modules: list[str]
-    apply_lora_to_mlp: bool
-    apply_lora_to_output: bool
-    rank: int
-    alpha: int
-    use_dora: bool | None = False
-    quantize_base: bool | None = False
-
-
-@json_schema_type
-class QATFinetuningConfig(BaseModel):
-    """Configuration for Quantization-Aware Training (QAT) fine-tuning.
-
-    :param type: Algorithm type identifier, always "QAT"
-    :param quantizer_name: Name of the quantization algorithm to use
-    :param group_size: Size of groups for grouped quantization
-    """
-
-    type: Literal["QAT"] = "QAT"
-    quantizer_name: str
-    group_size: int
-
-
-AlgorithmConfig = Annotated[LoraFinetuningConfig | QATFinetuningConfig, Field(discriminator="type")]
-register_schema(AlgorithmConfig, name="AlgorithmConfig")
-
-
-@json_schema_type
-class PostTrainingJobLogStream(BaseModel):
-    """Stream of logs from a finetuning job.
-
-    :param job_uuid: Unique identifier for the training job
-    :param log_lines: List of log message strings from the training process
-    """
-
-    job_uuid: str
-    log_lines: list[str]
-
-
-@json_schema_type
-class RLHFAlgorithm(Enum):
-    """Available reinforcement learning from human feedback algorithms.
-    :cvar dpo: Direct Preference Optimization algorithm
-    """
-
-    dpo = "dpo"
-
-
-@json_schema_type
-class DPOLossType(Enum):
-    sigmoid = "sigmoid"
-    hinge = "hinge"
-    ipo = "ipo"
-    kto_pair = "kto_pair"
-
-
-@json_schema_type
-class DPOAlignmentConfig(BaseModel):
-    """Configuration for Direct Preference Optimization (DPO) alignment.
-
-    :param beta: Temperature parameter for the DPO loss
-    :param loss_type: The type of loss function to use for DPO
-    """
-
-    beta: float
-    loss_type: DPOLossType = DPOLossType.sigmoid
-
-
-@json_schema_type
-class PostTrainingRLHFRequest(BaseModel):
-    """Request to finetune a model using reinforcement learning from human feedback.
-
-    :param job_uuid: Unique identifier for the training job
-    :param finetuned_model: URL or path to the base model to fine-tune
-    :param dataset_id: Unique identifier for the training dataset
-    :param validation_dataset_id: Unique identifier for the validation dataset
-    :param algorithm: RLHF algorithm to use for training
-    :param algorithm_config: Configuration parameters for the RLHF algorithm
-    :param optimizer_config: Configuration parameters for the optimization algorithm
-    :param training_config: Configuration parameters for the training process
-    :param hyperparam_search_config: Configuration for hyperparameter search
-    :param logger_config: Configuration for training logging
-    """
-
-    job_uuid: str
-
-    finetuned_model: URL
-
-    dataset_id: str
-    validation_dataset_id: str
-
-    algorithm: RLHFAlgorithm
-    algorithm_config: DPOAlignmentConfig
-
-    optimizer_config: OptimizerConfig
-    training_config: TrainingConfig
-
-    # TODO: define these
-    hyperparam_search_config: dict[str, Any]
-    logger_config: dict[str, Any]
-
-
-class PostTrainingJob(BaseModel):
-    job_uuid: str
-
-
-@json_schema_type
-class PostTrainingJobStatusResponse(BaseModel):
-    """Status of a finetuning job.
-
-    :param job_uuid: Unique identifier for the training job
-    :param status: Current status of the training job
-    :param scheduled_at: (Optional) Timestamp when the job was scheduled
-    :param started_at: (Optional) Timestamp when the job execution began
-    :param completed_at: (Optional) Timestamp when the job finished, if completed
-    :param resources_allocated: (Optional) Information about computational resources allocated to the job
-    :param checkpoints: List of model checkpoints created during training
-    """
-
-    job_uuid: str
-    status: JobStatus
-
-    scheduled_at: datetime | None = None
-    started_at: datetime | None = None
-    completed_at: datetime | None = None
-
-    resources_allocated: dict[str, Any] | None = None
-
-    checkpoints: list[Checkpoint] = Field(default_factory=list)
-
-
-class ListPostTrainingJobsResponse(BaseModel):
-    data: list[PostTrainingJob]
-
-
-@json_schema_type
-class PostTrainingJobArtifactsResponse(BaseModel):
-    """Artifacts of a finetuning job.
-
-    :param job_uuid: Unique identifier for the training job
-    :param checkpoints: List of model checkpoints created during training
-    """
-
-    job_uuid: str
-    checkpoints: list[Checkpoint] = Field(default_factory=list)
-
-    # TODO(ashwin): metrics, evals
-
-
-class PostTraining(Protocol):
-    @webmethod(route="/post-training/supervised-fine-tune", method="POST", level=LLAMA_STACK_API_V1ALPHA)
-    async def supervised_fine_tune(
-        self,
-        job_uuid: str,
-        training_config: TrainingConfig,
-        hyperparam_search_config: dict[str, Any],
-        logger_config: dict[str, Any],
-        model: str | None = Field(
-            default=None,
-            description="Model descriptor for training if not in provider config`",
-        ),
-        checkpoint_dir: str | None = None,
-        algorithm_config: AlgorithmConfig | None = None,
-    ) -> PostTrainingJob:
-        """Run supervised fine-tuning of a model.
-
-        :param job_uuid: The UUID of the job to create.
-        :param training_config: The training configuration.
-        :param hyperparam_search_config: The hyperparam search configuration.
-        :param logger_config: The logger configuration.
-        :param model: The model to fine-tune.
-        :param checkpoint_dir: The directory to save checkpoint(s) to.
-        :param algorithm_config: The algorithm configuration.
-        :returns: A PostTrainingJob.
-        """
-        ...
-
-    @webmethod(route="/post-training/preference-optimize", method="POST", level=LLAMA_STACK_API_V1ALPHA)
-    async def preference_optimize(
-        self,
-        job_uuid: str,
-        finetuned_model: str,
-        algorithm_config: DPOAlignmentConfig,
-        training_config: TrainingConfig,
-        hyperparam_search_config: dict[str, Any],
-        logger_config: dict[str, Any],
-    ) -> PostTrainingJob:
-        """Run preference optimization of a model.
-
-        :param job_uuid: The UUID of the job to create.
-        :param finetuned_model: The model to fine-tune.
-        :param algorithm_config: The algorithm configuration.
-        :param training_config: The training configuration.
-        :param hyperparam_search_config: The hyperparam search configuration.
-        :param logger_config: The logger configuration.
-        :returns: A PostTrainingJob.
-        """
-        ...
-
-    @webmethod(route="/post-training/jobs", method="GET", level=LLAMA_STACK_API_V1ALPHA)
-    async def get_training_jobs(self) -> ListPostTrainingJobsResponse:
-        """Get all training jobs.
-
-        :returns: A ListPostTrainingJobsResponse.
-        """
-        ...
-
-    @webmethod(route="/post-training/job/status", method="GET", level=LLAMA_STACK_API_V1ALPHA)
-    async def get_training_job_status(self, job_uuid: str) -> PostTrainingJobStatusResponse:
-        """Get the status of a training job.
-
-        :param job_uuid: The UUID of the job to get the status of.
-        :returns: A PostTrainingJobStatusResponse.
-        """
-        ...
-
-    @webmethod(route="/post-training/job/cancel", method="POST", level=LLAMA_STACK_API_V1ALPHA)
-    async def cancel_training_job(self, job_uuid: str) -> None:
-        """Cancel a training job.
-
-        :param job_uuid: The UUID of the job to cancel.
-        """
-        ...
-
-    @webmethod(route="/post-training/job/artifacts", method="GET", level=LLAMA_STACK_API_V1ALPHA)
-    async def get_training_job_artifacts(self, job_uuid: str) -> PostTrainingJobArtifactsResponse:
-        """Get the artifacts of a training job.
-
-        :param job_uuid: The UUID of the job to get the artifacts of.
-        :returns: A PostTrainingJobArtifactsResponse.
-        """
-        ...
--- a/src/llama_stack/apis/prompts/init.py
+++ b/src/llama_stack/apis/prompts/init.py
@ -1,9 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from .prompts import ListPromptsResponse, Prompt, Prompts
-
-__all__ = ["Prompt", "Prompts", "ListPromptsResponse"]
--- a/src/llama_stack/apis/prompts/prompts.py
+++ b/src/llama_stack/apis/prompts/prompts.py
@ -1,204 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-import re
-import secrets
-from typing import Protocol, runtime_checkable
-
-from pydantic import BaseModel, Field, field_validator, model_validator
-
-from llama_stack.apis.version import LLAMA_STACK_API_V1
-from llama_stack.core.telemetry.trace_protocol import trace_protocol
-from llama_stack.schema_utils import json_schema_type, webmethod
-
-
-@json_schema_type
-class Prompt(BaseModel):
-    """A prompt resource representing a stored OpenAI Compatible prompt template in Llama Stack.
-
-    :param prompt: The system prompt text with variable placeholders. Variables are only supported when using the Responses API.
-    :param version: Version (integer starting at 1, incremented on save)
-    :param prompt_id: Unique identifier formatted as 'pmpt_<48-digit-hash>'
-    :param variables: List of prompt variable names that can be used in the prompt template
-    :param is_default: Boolean indicating whether this version is the default version for this prompt
-    """
-
-    prompt: str | None = Field(default=None, description="The system prompt with variable placeholders")
-    version: int = Field(description="Version (integer starting at 1, incremented on save)", ge=1)
-    prompt_id: str = Field(description="Unique identifier in format 'pmpt_<48-digit-hash>'")
-    variables: list[str] = Field(
-        default_factory=list, description="List of variable names that can be used in the prompt template"
-    )
-    is_default: bool = Field(
-        default=False, description="Boolean indicating whether this version is the default version"
-    )
-
-    @field_validator("prompt_id")
-    @classmethod
-    def validate_prompt_id(cls, prompt_id: str) -> str:
-        if not isinstance(prompt_id, str):
-            raise TypeError("prompt_id must be a string in format 'pmpt_<48-digit-hash>'")
-
-        if not prompt_id.startswith("pmpt_"):
-            raise ValueError("prompt_id must start with 'pmpt_' prefix")
-
-        hex_part = prompt_id[5:]
-        if len(hex_part) != 48:
-            raise ValueError("prompt_id must be in format 'pmpt_<48-digit-hash>' (48 lowercase hex chars)")
-
-        for char in hex_part:
-            if char not in "0123456789abcdef":
-                raise ValueError("prompt_id hex part must contain only lowercase hex characters [0-9a-f]")
-
-        return prompt_id
-
-    @field_validator("version")
-    @classmethod
-    def validate_version(cls, prompt_version: int) -> int:
-        if prompt_version < 1:
-            raise ValueError("version must be >= 1")
-        return prompt_version
-
-    @model_validator(mode="after")
-    def validate_prompt_variables(self):
-        """Validate that all variables used in the prompt are declared in the variables list."""
-        if not self.prompt:
-            return self
-
-        prompt_variables = set(re.findall(r"{{\s*(\w+)\s*}}", self.prompt))
-        declared_variables = set(self.variables)
-
-        undeclared = prompt_variables - declared_variables
-        if undeclared:
-            raise ValueError(f"Prompt contains undeclared variables: {sorted(undeclared)}")
-
-        return self
-
-    @classmethod
-    def generate_prompt_id(cls) -> str:
-        # Generate 48 hex characters (24 bytes)
-        random_bytes = secrets.token_bytes(24)
-        hex_string = random_bytes.hex()
-        return f"pmpt_{hex_string}"
-
-
-class ListPromptsResponse(BaseModel):
-    """Response model to list prompts."""
-
-    data: list[Prompt]
-
-
-@runtime_checkable
-@trace_protocol
-class Prompts(Protocol):
-    """Prompts
-
-    Protocol for prompt management operations."""
-
-    @webmethod(route="/prompts", method="GET", level=LLAMA_STACK_API_V1)
-    async def list_prompts(self) -> ListPromptsResponse:
-        """List all prompts.
-
-        :returns: A ListPromptsResponse containing all prompts.
-        """
-        ...
-
-    @webmethod(route="/prompts/{prompt_id}/versions", method="GET", level=LLAMA_STACK_API_V1)
-    async def list_prompt_versions(
-        self,
-        prompt_id: str,
-    ) -> ListPromptsResponse:
-        """List prompt versions.
-
-        List all versions of a specific prompt.
-
-        :param prompt_id: The identifier of the prompt to list versions for.
-        :returns: A ListPromptsResponse containing all versions of the prompt.
-        """
-        ...
-
-    @webmethod(route="/prompts/{prompt_id}", method="GET", level=LLAMA_STACK_API_V1)
-    async def get_prompt(
-        self,
-        prompt_id: str,
-        version: int | None = None,
-    ) -> Prompt:
-        """Get prompt.
-
-        Get a prompt by its identifier and optional version.
-
-        :param prompt_id: The identifier of the prompt to get.
-        :param version: The version of the prompt to get (defaults to latest).
-        :returns: A Prompt resource.
-        """
-        ...
-
-    @webmethod(route="/prompts", method="POST", level=LLAMA_STACK_API_V1)
-    async def create_prompt(
-        self,
-        prompt: str,
-        variables: list[str] | None = None,
-    ) -> Prompt:
-        """Create prompt.
-
-        Create a new prompt.
-
-        :param prompt: The prompt text content with variable placeholders.
-        :param variables: List of variable names that can be used in the prompt template.
-        :returns: The created Prompt resource.
-        """
-        ...
-
-    @webmethod(route="/prompts/{prompt_id}", method="PUT", level=LLAMA_STACK_API_V1)
-    async def update_prompt(
-        self,
-        prompt_id: str,
-        prompt: str,
-        version: int,
-        variables: list[str] | None = None,
-        set_as_default: bool = True,
-    ) -> Prompt:
-        """Update prompt.
-
-        Update an existing prompt (increments version).
-
-        :param prompt_id: The identifier of the prompt to update.
-        :param prompt: The updated prompt text content.
-        :param version: The current version of the prompt being updated.
-        :param variables: Updated list of variable names that can be used in the prompt template.
-        :param set_as_default: Set the new version as the default (default=True).
-        :returns: The updated Prompt resource with incremented version.
-        """
-        ...
-
-    @webmethod(route="/prompts/{prompt_id}", method="DELETE", level=LLAMA_STACK_API_V1)
-    async def delete_prompt(
-        self,
-        prompt_id: str,
-    ) -> None:
-        """Delete prompt.
-
-        Delete a prompt.
-
-        :param prompt_id: The identifier of the prompt to delete.
-        """
-        ...
-
-    @webmethod(route="/prompts/{prompt_id}/set-default-version", method="PUT", level=LLAMA_STACK_API_V1)
-    async def set_default_version(
-        self,
-        prompt_id: str,
-        version: int,
-    ) -> Prompt:
-        """Set prompt version.
-
-        Set which version of a prompt should be the default in get_prompt (latest).
-
-        :param prompt_id: The identifier of the prompt.
-        :param version: The version to set as default.
-        :returns: The prompt with the specified version now set as default.
-        """
-        ...
--- a/src/llama_stack/apis/providers/init.py
+++ b/src/llama_stack/apis/providers/init.py
@ -1,7 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from .providers import *
--- a/src/llama_stack/apis/providers/providers.py
+++ b/src/llama_stack/apis/providers/providers.py
@ -1,69 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from typing import Any, Protocol, runtime_checkable
-
-from pydantic import BaseModel
-
-from llama_stack.apis.version import LLAMA_STACK_API_V1
-from llama_stack.providers.datatypes import HealthResponse
-from llama_stack.schema_utils import json_schema_type, webmethod
-
-
-@json_schema_type
-class ProviderInfo(BaseModel):
-    """Information about a registered provider including its configuration and health status.
-
-    :param api: The API name this provider implements
-    :param provider_id: Unique identifier for the provider
-    :param provider_type: The type of provider implementation
-    :param config: Configuration parameters for the provider
-    :param health: Current health status of the provider
-    """
-
-    api: str
-    provider_id: str
-    provider_type: str
-    config: dict[str, Any]
-    health: HealthResponse
-
-
-class ListProvidersResponse(BaseModel):
-    """Response containing a list of all available providers.
-
-    :param data: List of provider information objects
-    """
-
-    data: list[ProviderInfo]
-
-
-@runtime_checkable
-class Providers(Protocol):
-    """Providers
-
-    Providers API for inspecting, listing, and modifying providers and their configurations.
-    """
-
-    @webmethod(route="/providers", method="GET", level=LLAMA_STACK_API_V1)
-    async def list_providers(self) -> ListProvidersResponse:
-        """List providers.
-
-        List all available providers.
-
-        :returns: A ListProvidersResponse containing information about all providers.
-        """
-        ...
-
-    @webmethod(route="/providers/{provider_id}", method="GET", level=LLAMA_STACK_API_V1)
-    async def inspect_provider(self, provider_id: str) -> ProviderInfo:
-        """Get provider.
-
-        Get detailed information about a specific provider.
-
-        :param provider_id: The ID of the provider to inspect.
-        :returns: A ProviderInfo object containing the provider's details.
-        """
-        ...
--- a/src/llama_stack/apis/resource.py
+++ b/src/llama_stack/apis/resource.py
@ -1,37 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-
-from enum import StrEnum
-
-from pydantic import BaseModel, Field
-
-
-class ResourceType(StrEnum):
-    model = "model"
-    shield = "shield"
-    vector_store = "vector_store"
-    dataset = "dataset"
-    scoring_function = "scoring_function"
-    benchmark = "benchmark"
-    tool = "tool"
-    tool_group = "tool_group"
-    prompt = "prompt"
-
-
-class Resource(BaseModel):
-    """Base class for all Llama Stack resources"""
-
-    identifier: str = Field(description="Unique identifier for this resource in llama stack")
-
-    provider_resource_id: str | None = Field(
-        default=None,
-        description="Unique identifier for this resource in the provider",
-    )
-
-    provider_id: str = Field(description="ID of the provider that owns this resource")
-
-    type: ResourceType = Field(description="Type of resource (e.g. 'model', 'shield', 'vector_store', etc.)")
--- a/src/llama_stack/apis/safety/init.py
+++ b/src/llama_stack/apis/safety/init.py
@ -1,7 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from .safety import *
--- a/src/llama_stack/apis/safety/safety.py
+++ b/src/llama_stack/apis/safety/safety.py
@ -1,134 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from enum import Enum
-from typing import Any, Protocol, runtime_checkable
-
-from pydantic import BaseModel, Field
-
-from llama_stack.apis.inference import OpenAIMessageParam
-from llama_stack.apis.shields import Shield
-from llama_stack.apis.version import LLAMA_STACK_API_V1
-from llama_stack.core.telemetry.trace_protocol import trace_protocol
-from llama_stack.schema_utils import json_schema_type, webmethod
-
-
-@json_schema_type
-class ModerationObjectResults(BaseModel):
-    """A moderation object.
-    :param flagged: Whether any of the below categories are flagged.
-    :param categories: A list of the categories, and whether they are flagged or not.
-    :param category_applied_input_types: A list of the categories along with the input type(s) that the score applies to.
-    :param category_scores: A list of the categories along with their scores as predicted by model.
-    """
-
-    flagged: bool
-    categories: dict[str, bool] | None = None
-    category_applied_input_types: dict[str, list[str]] | None = None
-    category_scores: dict[str, float] | None = None
-    user_message: str | None = None
-    metadata: dict[str, Any] = Field(default_factory=dict)
-
-
-@json_schema_type
-class ModerationObject(BaseModel):
-    """A moderation object.
-    :param id: The unique identifier for the moderation request.
-    :param model: The model used to generate the moderation results.
-    :param results: A list of moderation objects
-    """
-
-    id: str
-    model: str
-    results: list[ModerationObjectResults]
-
-
-@json_schema_type
-class ViolationLevel(Enum):
-    """Severity level of a safety violation.
-
-    :cvar INFO: Informational level violation that does not require action
-    :cvar WARN: Warning level violation that suggests caution but allows continuation
-    :cvar ERROR: Error level violation that requires blocking or intervention
-    """
-
-    INFO = "info"
-    WARN = "warn"
-    ERROR = "error"
-
-
-@json_schema_type
-class SafetyViolation(BaseModel):
-    """Details of a safety violation detected by content moderation.
-
-    :param violation_level: Severity level of the violation
-    :param user_message: (Optional) Message to convey to the user about the violation
-    :param metadata: Additional metadata including specific violation codes for debugging and telemetry
-    """
-
-    violation_level: ViolationLevel
-
-    # what message should you convey to the user
-    user_message: str | None = None
-
-    # additional metadata (including specific violation codes) more for
-    # debugging, telemetry
-    metadata: dict[str, Any] = Field(default_factory=dict)
-
-
-@json_schema_type
-class RunShieldResponse(BaseModel):
-    """Response from running a safety shield.
-
-    :param violation: (Optional) Safety violation detected by the shield, if any
-    """
-
-    violation: SafetyViolation | None = None
-
-
-class ShieldStore(Protocol):
-    async def get_shield(self, identifier: str) -> Shield: ...
-
-
-@runtime_checkable
-@trace_protocol
-class Safety(Protocol):
-    """Safety
-
-    OpenAI-compatible Moderations API.
-    """
-
-    shield_store: ShieldStore
-
-    @webmethod(route="/safety/run-shield", method="POST", level=LLAMA_STACK_API_V1)
-    async def run_shield(
-        self,
-        shield_id: str,
-        messages: list[OpenAIMessageParam],
-        params: dict[str, Any],
-    ) -> RunShieldResponse:
-        """Run shield.
-
-        Run a shield.
-
-        :param shield_id: The identifier of the shield to run.
-        :param messages: The messages to run the shield on.
-        :param params: The parameters of the shield.
-        :returns: A RunShieldResponse.
-        """
-        ...
-
-    @webmethod(route="/moderations", method="POST", level=LLAMA_STACK_API_V1)
-    async def run_moderation(self, input: str | list[str], model: str | None = None) -> ModerationObject:
-        """Create moderation.
-
-        Classifies if text and/or image inputs are potentially harmful.
-        :param input: Input (or inputs) to classify.
-        Can be a single string, an array of strings, or an array of multi-modal input objects similar to other models.
-        :param model: (Optional) The content moderation model you would like to use.
-        :returns: A moderation object.
-        """
-        ...
--- a/src/llama_stack/apis/scoring/init.py
+++ b/src/llama_stack/apis/scoring/init.py
@ -1,7 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from .scoring import *
--- a/src/llama_stack/apis/scoring/scoring.py
+++ b/src/llama_stack/apis/scoring/scoring.py
@ -1,93 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from typing import Any, Protocol, runtime_checkable
-
-from pydantic import BaseModel
-
-from llama_stack.apis.scoring_functions import ScoringFn, ScoringFnParams
-from llama_stack.apis.version import LLAMA_STACK_API_V1
-from llama_stack.schema_utils import json_schema_type, webmethod
-
-# mapping of metric to value
-ScoringResultRow = dict[str, Any]
-
-
-@json_schema_type
-class ScoringResult(BaseModel):
-    """
-    A scoring result for a single row.
-
-    :param score_rows: The scoring result for each row. Each row is a map of column name to value.
-    :param aggregated_results: Map of metric name to aggregated value
-    """
-
-    score_rows: list[ScoringResultRow]
-    # aggregated metrics to value
-    aggregated_results: dict[str, Any]
-
-
-@json_schema_type
-class ScoreBatchResponse(BaseModel):
-    """Response from batch scoring operations on datasets.
-
-    :param dataset_id: (Optional) The identifier of the dataset that was scored
-    :param results: A map of scoring function name to ScoringResult
-    """
-
-    dataset_id: str | None = None
-    results: dict[str, ScoringResult]
-
-
-@json_schema_type
-class ScoreResponse(BaseModel):
-    """
-    The response from scoring.
-
-    :param results: A map of scoring function name to ScoringResult.
-    """
-
-    # each key in the dict is a scoring function name
-    results: dict[str, ScoringResult]
-
-
-class ScoringFunctionStore(Protocol):
-    def get_scoring_function(self, scoring_fn_id: str) -> ScoringFn: ...
-
-
-@runtime_checkable
-class Scoring(Protocol):
-    scoring_function_store: ScoringFunctionStore
-
-    @webmethod(route="/scoring/score-batch", method="POST", level=LLAMA_STACK_API_V1)
-    async def score_batch(
-        self,
-        dataset_id: str,
-        scoring_functions: dict[str, ScoringFnParams | None],
-        save_results_dataset: bool = False,
-    ) -> ScoreBatchResponse:
-        """Score a batch of rows.
-
-        :param dataset_id: The ID of the dataset to score.
-        :param scoring_functions: The scoring functions to use for the scoring.
-        :param save_results_dataset: Whether to save the results to a dataset.
-        :returns: A ScoreBatchResponse.
-        """
-        ...
-
-    @webmethod(route="/scoring/score", method="POST", level=LLAMA_STACK_API_V1)
-    async def score(
-        self,
-        input_rows: list[dict[str, Any]],
-        scoring_functions: dict[str, ScoringFnParams | None],
-    ) -> ScoreResponse:
-        """Score a list of rows.
-
-        :param input_rows: The rows to score.
-        :param scoring_functions: The scoring functions to use for the scoring.
-        :returns: A ScoreResponse object containing rows and aggregated results.
-        """
-        ...
--- a/src/llama_stack/apis/scoring_functions/init.py
+++ b/src/llama_stack/apis/scoring_functions/init.py
@ -1,7 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from .scoring_functions import *
--- a/src/llama_stack/apis/scoring_functions/scoring_functions.py
+++ b/src/llama_stack/apis/scoring_functions/scoring_functions.py
@ -1,208 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-# TODO: use enum.StrEnum when we drop support for python 3.10
-from enum import StrEnum
-from typing import (
-    Annotated,
-    Any,
-    Literal,
-    Protocol,
-    runtime_checkable,
-)
-
-from pydantic import BaseModel, Field
-
-from llama_stack.apis.common.type_system import ParamType
-from llama_stack.apis.resource import Resource, ResourceType
-from llama_stack.apis.version import LLAMA_STACK_API_V1
-from llama_stack.schema_utils import json_schema_type, register_schema, webmethod
-
-
-# Perhaps more structure can be imposed on these functions. Maybe they could be associated
-# with standard metrics so they can be rolled up?
-@json_schema_type
-class ScoringFnParamsType(StrEnum):
-    """Types of scoring function parameter configurations.
-    :cvar llm_as_judge: Use an LLM model to evaluate and score responses
-    :cvar regex_parser: Use regex patterns to extract and score specific parts of responses
-    :cvar basic: Basic scoring with simple aggregation functions
-    """
-
-    llm_as_judge = "llm_as_judge"
-    regex_parser = "regex_parser"
-    basic = "basic"
-
-
-@json_schema_type
-class AggregationFunctionType(StrEnum):
-    """Types of aggregation functions for scoring results.
-    :cvar average: Calculate the arithmetic mean of scores
-    :cvar weighted_average: Calculate a weighted average of scores
-    :cvar median: Calculate the median value of scores
-    :cvar categorical_count: Count occurrences of categorical values
-    :cvar accuracy: Calculate accuracy as the proportion of correct answers
-    """
-
-    average = "average"
-    weighted_average = "weighted_average"
-    median = "median"
-    categorical_count = "categorical_count"
-    accuracy = "accuracy"
-
-
-@json_schema_type
-class LLMAsJudgeScoringFnParams(BaseModel):
-    """Parameters for LLM-as-judge scoring function configuration.
-    :param type: The type of scoring function parameters, always llm_as_judge
-    :param judge_model: Identifier of the LLM model to use as a judge for scoring
-    :param prompt_template: (Optional) Custom prompt template for the judge model
-    :param judge_score_regexes: Regexes to extract the answer from generated response
-    :param aggregation_functions: Aggregation functions to apply to the scores of each row
-    """
-
-    type: Literal[ScoringFnParamsType.llm_as_judge] = ScoringFnParamsType.llm_as_judge
-    judge_model: str
-    prompt_template: str | None = None
-    judge_score_regexes: list[str] = Field(
-        description="Regexes to extract the answer from generated response",
-        default_factory=lambda: [],
-    )
-    aggregation_functions: list[AggregationFunctionType] = Field(
-        description="Aggregation functions to apply to the scores of each row",
-        default_factory=lambda: [],
-    )
-
-
-@json_schema_type
-class RegexParserScoringFnParams(BaseModel):
-    """Parameters for regex parser scoring function configuration.
-    :param type: The type of scoring function parameters, always regex_parser
-    :param parsing_regexes: Regex to extract the answer from generated response
-    :param aggregation_functions: Aggregation functions to apply to the scores of each row
-    """
-
-    type: Literal[ScoringFnParamsType.regex_parser] = ScoringFnParamsType.regex_parser
-    parsing_regexes: list[str] = Field(
-        description="Regex to extract the answer from generated response",
-        default_factory=lambda: [],
-    )
-    aggregation_functions: list[AggregationFunctionType] = Field(
-        description="Aggregation functions to apply to the scores of each row",
-        default_factory=lambda: [],
-    )
-
-
-@json_schema_type
-class BasicScoringFnParams(BaseModel):
-    """Parameters for basic scoring function configuration.
-    :param type: The type of scoring function parameters, always basic
-    :param aggregation_functions: Aggregation functions to apply to the scores of each row
-    """
-
-    type: Literal[ScoringFnParamsType.basic] = ScoringFnParamsType.basic
-    aggregation_functions: list[AggregationFunctionType] = Field(
-        description="Aggregation functions to apply to the scores of each row",
-        default_factory=list,
-    )
-
-
-ScoringFnParams = Annotated[
-    LLMAsJudgeScoringFnParams | RegexParserScoringFnParams | BasicScoringFnParams,
-    Field(discriminator="type"),
-]
-register_schema(ScoringFnParams, name="ScoringFnParams")
-
-
-class CommonScoringFnFields(BaseModel):
-    description: str | None = None
-    metadata: dict[str, Any] = Field(
-        default_factory=dict,
-        description="Any additional metadata for this definition",
-    )
-    return_type: ParamType = Field(
-        description="The return type of the deterministic function",
-    )
-    params: ScoringFnParams | None = Field(
-        description="The parameters for the scoring function for benchmark eval, these can be overridden for app eval",
-        default=None,
-    )
-
-
-@json_schema_type
-class ScoringFn(CommonScoringFnFields, Resource):
-    """A scoring function resource for evaluating model outputs.
-    :param type: The resource type, always scoring_function
-    """
-
-    type: Literal[ResourceType.scoring_function] = ResourceType.scoring_function
-
-    @property
-    def scoring_fn_id(self) -> str:
-        return self.identifier
-
-    @property
-    def provider_scoring_fn_id(self) -> str | None:
-        return self.provider_resource_id
-
-
-class ScoringFnInput(CommonScoringFnFields, BaseModel):
-    scoring_fn_id: str
-    provider_id: str | None = None
-    provider_scoring_fn_id: str | None = None
-
-
-class ListScoringFunctionsResponse(BaseModel):
-    data: list[ScoringFn]
-
-
-@runtime_checkable
-class ScoringFunctions(Protocol):
-    @webmethod(route="/scoring-functions", method="GET", level=LLAMA_STACK_API_V1)
-    async def list_scoring_functions(self) -> ListScoringFunctionsResponse:
-        """List all scoring functions.
-
-        :returns: A ListScoringFunctionsResponse.
-        """
-        ...
-
-    @webmethod(route="/scoring-functions/{scoring_fn_id:path}", method="GET", level=LLAMA_STACK_API_V1)
-    async def get_scoring_function(self, scoring_fn_id: str, /) -> ScoringFn:
-        """Get a scoring function by its ID.
-
-        :param scoring_fn_id: The ID of the scoring function to get.
-        :returns: A ScoringFn.
-        """
-        ...
-
-    @webmethod(route="/scoring-functions", method="POST", level=LLAMA_STACK_API_V1)
-    async def register_scoring_function(
-        self,
-        scoring_fn_id: str,
-        description: str,
-        return_type: ParamType,
-        provider_scoring_fn_id: str | None = None,
-        provider_id: str | None = None,
-        params: ScoringFnParams | None = None,
-    ) -> None:
-        """Register a scoring function.
-
-        :param scoring_fn_id: The ID of the scoring function to register.
-        :param description: The description of the scoring function.
-        :param return_type: The return type of the scoring function.
-        :param provider_scoring_fn_id: The ID of the provider scoring function to use for the scoring function.
-        :param provider_id: The ID of the provider to use for the scoring function.
-        :param params: The parameters for the scoring function for benchmark eval, these can be overridden for app eval.
-        """
-        ...
-
-    @webmethod(route="/scoring-functions/{scoring_fn_id:path}", method="DELETE", level=LLAMA_STACK_API_V1)
-    async def unregister_scoring_function(self, scoring_fn_id: str) -> None:
-        """Unregister a scoring function.
-
-        :param scoring_fn_id: The ID of the scoring function to unregister.
-        """
-        ...
--- a/src/llama_stack/apis/shields/init.py
+++ b/src/llama_stack/apis/shields/init.py
@ -1,7 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from .shields import *
--- a/src/llama_stack/apis/shields/shields.py
+++ b/src/llama_stack/apis/shields/shields.py
@ -1,94 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from typing import Any, Literal, Protocol, runtime_checkable
-
-from pydantic import BaseModel
-
-from llama_stack.apis.resource import Resource, ResourceType
-from llama_stack.apis.version import LLAMA_STACK_API_V1
-from llama_stack.core.telemetry.trace_protocol import trace_protocol
-from llama_stack.schema_utils import json_schema_type, webmethod
-
-
-class CommonShieldFields(BaseModel):
-    params: dict[str, Any] | None = None
-
-
-@json_schema_type
-class Shield(CommonShieldFields, Resource):
-    """A safety shield resource that can be used to check content.
-
-    :param params: (Optional) Configuration parameters for the shield
-    :param type: The resource type, always shield
-    """
-
-    type: Literal[ResourceType.shield] = ResourceType.shield
-
-    @property
-    def shield_id(self) -> str:
-        return self.identifier
-
-    @property
-    def provider_shield_id(self) -> str | None:
-        return self.provider_resource_id
-
-
-class ShieldInput(CommonShieldFields):
-    shield_id: str
-    provider_id: str | None = None
-    provider_shield_id: str | None = None
-
-
-class ListShieldsResponse(BaseModel):
-    data: list[Shield]
-
-
-@runtime_checkable
-@trace_protocol
-class Shields(Protocol):
-    @webmethod(route="/shields", method="GET", level=LLAMA_STACK_API_V1)
-    async def list_shields(self) -> ListShieldsResponse:
-        """List all shields.
-
-        :returns: A ListShieldsResponse.
-        """
-        ...
-
-    @webmethod(route="/shields/{identifier:path}", method="GET", level=LLAMA_STACK_API_V1)
-    async def get_shield(self, identifier: str) -> Shield:
-        """Get a shield by its identifier.
-
-        :param identifier: The identifier of the shield to get.
-        :returns: A Shield.
-        """
-        ...
-
-    @webmethod(route="/shields", method="POST", level=LLAMA_STACK_API_V1)
-    async def register_shield(
-        self,
-        shield_id: str,
-        provider_shield_id: str | None = None,
-        provider_id: str | None = None,
-        params: dict[str, Any] | None = None,
-    ) -> Shield:
-        """Register a shield.
-
-        :param shield_id: The identifier of the shield to register.
-        :param provider_shield_id: The identifier of the shield in the provider.
-        :param provider_id: The identifier of the provider.
-        :param params: The parameters of the shield.
-        :returns: A Shield.
-        """
-        ...
-
-    @webmethod(route="/shields/{identifier:path}", method="DELETE", level=LLAMA_STACK_API_V1)
-    async def unregister_shield(self, identifier: str) -> None:
-        """Unregister a shield.
-
-        :param identifier: The identifier of the shield to unregister.
-        """
-        ...
--- a/src/llama_stack/apis/tools/init.py
+++ b/src/llama_stack/apis/tools/init.py
@ -1,8 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from .rag_tool import *
-from .tools import *
--- a/src/llama_stack/apis/tools/rag_tool.py
+++ b/src/llama_stack/apis/tools/rag_tool.py
@ -1,218 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from enum import Enum, StrEnum
-from typing import Annotated, Any, Literal, Protocol
-
-from pydantic import BaseModel, Field, field_validator
-from typing_extensions import runtime_checkable
-
-from llama_stack.apis.common.content_types import URL, InterleavedContent
-from llama_stack.apis.version import LLAMA_STACK_API_V1
-from llama_stack.core.telemetry.trace_protocol import trace_protocol
-from llama_stack.schema_utils import json_schema_type, register_schema, webmethod
-
-
-@json_schema_type
-class RRFRanker(BaseModel):
-    """
-    Reciprocal Rank Fusion (RRF) ranker configuration.
-
-    :param type: The type of ranker, always "rrf"
-    :param impact_factor: The impact factor for RRF scoring. Higher values give more weight to higher-ranked results.
-                         Must be greater than 0
-    """
-
-    type: Literal["rrf"] = "rrf"
-    impact_factor: float = Field(default=60.0, gt=0.0)  # default of 60 for optimal performance
-
-
-@json_schema_type
-class WeightedRanker(BaseModel):
-    """
-    Weighted ranker configuration that combines vector and keyword scores.
-
-    :param type: The type of ranker, always "weighted"
-    :param alpha: Weight factor between 0 and 1.
-                 0 means only use keyword scores,
-                 1 means only use vector scores,
-                 values in between blend both scores.
-    """
-
-    type: Literal["weighted"] = "weighted"
-    alpha: float = Field(
-        default=0.5,
-        ge=0.0,
-        le=1.0,
-        description="Weight factor between 0 and 1. 0 means only keyword scores, 1 means only vector scores.",
-    )
-
-
-Ranker = Annotated[
-    RRFRanker | WeightedRanker,
-    Field(discriminator="type"),
-]
-register_schema(Ranker, name="Ranker")
-
-
-@json_schema_type
-class RAGDocument(BaseModel):
-    """
-    A document to be used for document ingestion in the RAG Tool.
-
-    :param document_id: The unique identifier for the document.
-    :param content: The content of the document.
-    :param mime_type: The MIME type of the document.
-    :param metadata: Additional metadata for the document.
-    """
-
-    document_id: str
-    content: InterleavedContent | URL
-    mime_type: str | None = None
-    metadata: dict[str, Any] = Field(default_factory=dict)
-
-
-@json_schema_type
-class RAGQueryResult(BaseModel):
-    """Result of a RAG query containing retrieved content and metadata.
-
-    :param content: (Optional) The retrieved content from the query
-    :param metadata: Additional metadata about the query result
-    """
-
-    content: InterleavedContent | None = None
-    metadata: dict[str, Any] = Field(default_factory=dict)
-
-
-@json_schema_type
-class RAGQueryGenerator(Enum):
-    """Types of query generators for RAG systems.
-
-    :cvar default: Default query generator using simple text processing
-    :cvar llm: LLM-based query generator for enhanced query understanding
-    :cvar custom: Custom query generator implementation
-    """
-
-    default = "default"
-    llm = "llm"
-    custom = "custom"
-
-
-@json_schema_type
-class RAGSearchMode(StrEnum):
-    """
-    Search modes for RAG query retrieval:
-    - VECTOR: Uses vector similarity search for semantic matching
-    - KEYWORD: Uses keyword-based search for exact matching
-    - HYBRID: Combines both vector and keyword search for better results
-    """
-
-    VECTOR = "vector"
-    KEYWORD = "keyword"
-    HYBRID = "hybrid"
-
-
-@json_schema_type
-class DefaultRAGQueryGeneratorConfig(BaseModel):
-    """Configuration for the default RAG query generator.
-
-    :param type: Type of query generator, always 'default'
-    :param separator: String separator used to join query terms
-    """
-
-    type: Literal["default"] = "default"
-    separator: str = " "
-
-
-@json_schema_type
-class LLMRAGQueryGeneratorConfig(BaseModel):
-    """Configuration for the LLM-based RAG query generator.
-
-    :param type: Type of query generator, always 'llm'
-    :param model: Name of the language model to use for query generation
-    :param template: Template string for formatting the query generation prompt
-    """
-
-    type: Literal["llm"] = "llm"
-    model: str
-    template: str
-
-
-RAGQueryGeneratorConfig = Annotated[
-    DefaultRAGQueryGeneratorConfig | LLMRAGQueryGeneratorConfig,
-    Field(discriminator="type"),
-]
-register_schema(RAGQueryGeneratorConfig, name="RAGQueryGeneratorConfig")
-
-
-@json_schema_type
-class RAGQueryConfig(BaseModel):
-    """
-    Configuration for the RAG query generation.
-
-    :param query_generator_config: Configuration for the query generator.
-    :param max_tokens_in_context: Maximum number of tokens in the context.
-    :param max_chunks: Maximum number of chunks to retrieve.
-    :param chunk_template: Template for formatting each retrieved chunk in the context.
-        Available placeholders: {index} (1-based chunk ordinal), {chunk.content} (chunk content string), {metadata} (chunk metadata dict).
-        Default: "Result {index}\\nContent: {chunk.content}\\nMetadata: {metadata}\\n"
-    :param mode: Search mode for retrieval—either "vector", "keyword", or "hybrid". Default "vector".
-    :param ranker: Configuration for the ranker to use in hybrid search. Defaults to RRF ranker.
-    """
-
-    # This config defines how a query is generated using the messages
-    # for memory bank retrieval.
-    query_generator_config: RAGQueryGeneratorConfig = Field(default=DefaultRAGQueryGeneratorConfig())
-    max_tokens_in_context: int = 4096
-    max_chunks: int = 5
-    chunk_template: str = "Result {index}\nContent: {chunk.content}\nMetadata: {metadata}\n"
-    mode: RAGSearchMode | None = RAGSearchMode.VECTOR
-    ranker: Ranker | None = Field(default=None)  # Only used for hybrid mode
-
-    @field_validator("chunk_template")
-    def validate_chunk_template(cls, v: str) -> str:
-        if "{chunk.content}" not in v:
-            raise ValueError("chunk_template must contain {chunk.content}")
-        if "{index}" not in v:
-            raise ValueError("chunk_template must contain {index}")
-        if len(v) == 0:
-            raise ValueError("chunk_template must not be empty")
-        return v
-
-
-@runtime_checkable
-@trace_protocol
-class RAGToolRuntime(Protocol):
-    @webmethod(route="/tool-runtime/rag-tool/insert", method="POST", level=LLAMA_STACK_API_V1)
-    async def insert(
-        self,
-        documents: list[RAGDocument],
-        vector_store_id: str,
-        chunk_size_in_tokens: int = 512,
-    ) -> None:
-        """Index documents so they can be used by the RAG system.
-
-        :param documents: List of documents to index in the RAG system
-        :param vector_store_id: ID of the vector database to store the document embeddings
-        :param chunk_size_in_tokens: (Optional) Size in tokens for document chunking during indexing
-        """
-        ...
-
-    @webmethod(route="/tool-runtime/rag-tool/query", method="POST", level=LLAMA_STACK_API_V1)
-    async def query(
-        self,
-        content: InterleavedContent,
-        vector_store_ids: list[str],
-        query_config: RAGQueryConfig | None = None,
-    ) -> RAGQueryResult:
-        """Query the RAG system for context; typically invoked by the agent.
-
-        :param content: The query content to search for in the indexed documents
-        :param vector_store_ids: List of vector database IDs to search within
-        :param query_config: (Optional) Configuration parameters for the query operation
-        :returns: RAGQueryResult containing the retrieved content and metadata
-        """
-        ...
--- a/src/llama_stack/apis/tools/tools.py
+++ b/src/llama_stack/apis/tools/tools.py
@ -1,221 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from enum import Enum
-from typing import Any, Literal, Protocol
-
-from pydantic import BaseModel
-from typing_extensions import runtime_checkable
-
-from llama_stack.apis.common.content_types import URL, InterleavedContent
-from llama_stack.apis.resource import Resource, ResourceType
-from llama_stack.apis.version import LLAMA_STACK_API_V1
-from llama_stack.core.telemetry.trace_protocol import trace_protocol
-from llama_stack.schema_utils import json_schema_type, webmethod
-
-from .rag_tool import RAGToolRuntime
-
-
-@json_schema_type
-class ToolDef(BaseModel):
-    """Tool definition used in runtime contexts.
-
-    Only `name` is required; every other field defaults to None.
-
-    :param toolgroup_id: (Optional) ID of the tool group this tool belongs to
-    :param name: Name of the tool
-    :param description: (Optional) Human-readable description of what the tool does
-    :param input_schema: (Optional) JSON Schema for tool inputs (MCP inputSchema)
-    :param output_schema: (Optional) JSON Schema for tool outputs (MCP outputSchema)
-    :param metadata: (Optional) Additional metadata about the tool
-    """
-
-    toolgroup_id: str | None = None
-    name: str
-    description: str | None = None
-    input_schema: dict[str, Any] | None = None
-    output_schema: dict[str, Any] | None = None
-    metadata: dict[str, Any] | None = None
-
-
-@json_schema_type
-class ToolGroupInput(BaseModel):
-    """Input data for registering a tool group.
-
-    The four fields have the same names and types as the parameters of
-    `ToolGroups.register_tool_group`.
-
-    :param toolgroup_id: Unique identifier for the tool group
-    :param provider_id: ID of the provider that will handle this tool group
-    :param args: (Optional) Additional arguments to pass to the provider
-    :param mcp_endpoint: (Optional) Model Context Protocol endpoint for remote tools
-    """
-
-    toolgroup_id: str
-    provider_id: str
-    args: dict[str, Any] | None = None
-    mcp_endpoint: URL | None = None
-
-
-@json_schema_type
-class ToolGroup(Resource):
-    """A group of related tools managed together.
-
-    Extends `Resource`; only the fields declared here are documented below.
-
-    :param type: Type of resource, always `ResourceType.tool_group`
-    :param mcp_endpoint: (Optional) Model Context Protocol endpoint for remote tools
-    :param args: (Optional) Additional arguments for the tool group
-    """
-
-    type: Literal[ResourceType.tool_group] = ResourceType.tool_group
-    mcp_endpoint: URL | None = None
-    args: dict[str, Any] | None = None
-
-
-@json_schema_type
-class ToolInvocationResult(BaseModel):
-    """Result of a tool invocation, as returned by `ToolRuntime.invoke_tool`.
-
-    Every field is optional and defaults to None.
-
-    :param content: (Optional) The output content from the tool execution
-    :param error_message: (Optional) Error message if the tool execution failed
-    :param error_code: (Optional) Numeric error code if the tool execution failed
-    :param metadata: (Optional) Additional metadata about the tool execution
-    """
-
-    content: InterleavedContent | None = None
-    error_message: str | None = None
-    error_code: int | None = None
-    metadata: dict[str, Any] | None = None
-
-
-class ToolStore(Protocol):
-    """Lookup interface for tool and tool-group definitions.
-
-    `get_tool` resolves a tool name to a `ToolDef`; `get_tool_group` resolves a
-    tool group ID to a `ToolGroup`. `ToolRuntime` holds an optional instance as
-    `tool_store`.
-    """
-
-    async def get_tool(self, tool_name: str) -> ToolDef: ...
-    async def get_tool_group(self, toolgroup_id: str) -> ToolGroup: ...
-
-
-class ListToolGroupsResponse(BaseModel):
-    """Response containing a list of tool groups.
-
-    Return type of `ToolGroups.list_tool_groups`.
-
-    :param data: List of tool groups
-    """
-
-    data: list[ToolGroup]
-
-
-class ListToolDefsResponse(BaseModel):
-    """Response containing a list of tool definitions.
-
-    Return type of both `ToolGroups.list_tools` and `ToolRuntime.list_runtime_tools`.
-
-    :param data: List of tool definitions
-    """
-
-    data: list[ToolDef]
-
-
-@runtime_checkable
-@trace_protocol
-class ToolGroups(Protocol):
-    """Protocol for registering, looking up, listing and unregistering tool groups and their tools.
-
-    Note that `get_tool_group` (GET) and `unregister_toolgroup` (DELETE) share the
-    route `/toolgroups/{toolgroup_id:path}` and differ only in HTTP method.
-    """
-
-    @webmethod(route="/toolgroups", method="POST", level=LLAMA_STACK_API_V1)
-    async def register_tool_group(
-        self,
-        toolgroup_id: str,
-        provider_id: str,
-        mcp_endpoint: URL | None = None,
-        args: dict[str, Any] | None = None,
-    ) -> None:
-        """Register a tool group.
-
-        :param toolgroup_id: The ID of the tool group to register.
-        :param provider_id: The ID of the provider to use for the tool group.
-        :param mcp_endpoint: (Optional) The MCP endpoint to use for the tool group.
-        :param args: (Optional) A dictionary of arguments to pass to the tool group.
-        """
-        ...
-
-    @webmethod(route="/toolgroups/{toolgroup_id:path}", method="GET", level=LLAMA_STACK_API_V1)
-    async def get_tool_group(
-        self,
-        toolgroup_id: str,
-    ) -> ToolGroup:
-        """Get a tool group by its ID.
-
-        :param toolgroup_id: The ID of the tool group to get.
-        :returns: A ToolGroup.
-        """
-        ...
-
-    @webmethod(route="/toolgroups", method="GET", level=LLAMA_STACK_API_V1)
-    async def list_tool_groups(self) -> ListToolGroupsResponse:
-        """List tool groups.
-
-        Takes no filter arguments.
-
-        :returns: A ListToolGroupsResponse.
-        """
-        ...
-
-    @webmethod(route="/tools", method="GET", level=LLAMA_STACK_API_V1)
-    async def list_tools(self, toolgroup_id: str | None = None) -> ListToolDefsResponse:
-        """List tools, optionally restricted to a single tool group.
-
-        :param toolgroup_id: (Optional) The ID of the tool group to list tools for.
-        :returns: A ListToolDefsResponse.
-        """
-        ...
-
-    @webmethod(route="/tools/{tool_name:path}", method="GET", level=LLAMA_STACK_API_V1)
-    async def get_tool(
-        self,
-        tool_name: str,
-    ) -> ToolDef:
-        """Get a tool by its name.
-
-        :param tool_name: The name of the tool to get.
-        :returns: A ToolDef.
-        """
-        ...
-
-    @webmethod(route="/toolgroups/{toolgroup_id:path}", method="DELETE", level=LLAMA_STACK_API_V1)
-    async def unregister_toolgroup(
-        self,
-        toolgroup_id: str,
-    ) -> None:
-        """Unregister a tool group.
-
-        :param toolgroup_id: The ID of the tool group to unregister.
-        """
-        ...
-
-
-class SpecialToolGroup(Enum):
-    """Special tool groups with predefined functionality.
-
-    This is a plain `Enum`, not a string enum: a member does not compare equal to
-    its string value, so use `.value` to obtain `"rag_tool"`.
-
-    :cvar rag_tool: Retrieval-Augmented Generation tool group for document search and retrieval
-    """
-
-    rag_tool = "rag_tool"
-
-
-@runtime_checkable
-@trace_protocol
-class ToolRuntime(Protocol):
-    """Protocol for listing the tools available at runtime and invoking them by name."""
-
-    # Optional lookup interface for tool / tool-group definitions; None by default.
-    tool_store: ToolStore | None = None
-
-    # Optional RAG tool runtime (insert / query endpoints); None by default.
-    rag_tool: RAGToolRuntime | None = None
-
-    # TODO: This needs to be renamed once OPEN API generator name conflict issue is fixed.
-    @webmethod(route="/tool-runtime/list-tools", method="GET", level=LLAMA_STACK_API_V1)
-    async def list_runtime_tools(
-        self, tool_group_id: str | None = None, mcp_endpoint: URL | None = None
-    ) -> ListToolDefsResponse:
-        """List all tools in the runtime.
-
-        :param tool_group_id: (Optional) The ID of the tool group to list tools for.
-        :param mcp_endpoint: (Optional) The MCP endpoint to use for the tool group.
-        :returns: A ListToolDefsResponse.
-        """
-        ...
-
-    @webmethod(route="/tool-runtime/invoke", method="POST", level=LLAMA_STACK_API_V1)
-    async def invoke_tool(self, tool_name: str, kwargs: dict[str, Any]) -> ToolInvocationResult:
-        """Run a tool with the given arguments.
-
-        :param tool_name: The name of the tool to invoke.
-        :param kwargs: A dictionary of arguments to pass to the tool.
-        :returns: A ToolInvocationResult.
-        """
-        ...
--- a/src/llama_stack/apis/vector_io/init.py
+++ b/src/llama_stack/apis/vector_io/init.py
@ -1,7 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from .vector_io import *
--- a/src/llama_stack/apis/vector_io/vector_io.py
+++ b/src/llama_stack/apis/vector_io/vector_io.py
@ -1,862 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-from typing import Annotated, Any, Literal, Protocol, runtime_checkable
-
-from fastapi import Body
-from pydantic import BaseModel, Field
-
-from llama_stack.apis.inference import InterleavedContent
-from llama_stack.apis.vector_stores import VectorStore
-from llama_stack.apis.version import LLAMA_STACK_API_V1
-from llama_stack.core.telemetry.trace_protocol import trace_protocol
-from llama_stack.schema_utils import json_schema_type, webmethod
-from llama_stack.strong_typing.schema import register_schema
-
-
-@json_schema_type
-class ChunkMetadata(BaseModel):
-    """
-    `ChunkMetadata` is backend metadata for a `Chunk` that is used to store additional information about the chunk that
-        will not be used in the context during inference, but is required for backend functionality. The `ChunkMetadata`
-        is set during chunk creation in `MemoryToolRuntimeImpl().insert()` and is not expected to change after.
-        Use `Chunk.metadata` for metadata that will be used in the context during inference.
-        Every field is optional and defaults to None.
-    :param chunk_id: The ID of the chunk. If not set, it will be generated based on the document ID and content.
-    :param document_id: The ID of the document this chunk belongs to.
-    :param source: The source of the content, such as a URL, file path, or other identifier.
-    :param created_timestamp: An optional timestamp indicating when the chunk was created.
-    :param updated_timestamp: An optional timestamp indicating when the chunk was last updated.
-    :param chunk_window: The window of the chunk, which can be used to group related chunks together.
-    :param chunk_tokenizer: The tokenizer used to create the chunk. Default is Tiktoken
-        (NOTE(review): the field itself defaults to None, so that default is presumably applied elsewhere - confirm).
-    :param chunk_embedding_model: The embedding model used to create the chunk's embedding.
-    :param chunk_embedding_dimension: The dimension of the embedding vector for the chunk.
-    :param content_token_count: The number of tokens in the content of the chunk.
-    :param metadata_token_count: The number of tokens in the metadata of the chunk.
-    """
-
-    chunk_id: str | None = None
-    document_id: str | None = None
-    source: str | None = None
-    created_timestamp: int | None = None
-    updated_timestamp: int | None = None
-    chunk_window: str | None = None
-    chunk_tokenizer: str | None = None
-    chunk_embedding_model: str | None = None
-    chunk_embedding_dimension: int | None = None
-    content_token_count: int | None = None
-    metadata_token_count: int | None = None
-
-
-@json_schema_type
-class Chunk(BaseModel):
-    """
-    A chunk of content that can be inserted into a vector database.
-    :param content: The content of the chunk, which can be interleaved text, images, or other types.
-    :param chunk_id: Unique identifier for the chunk. Must be provided explicitly.
-    :param metadata: Metadata associated with the chunk that will be used in the model context during inference.
-    :param embedding: Optional embedding for the chunk. If not provided, it will be computed later.
-    :param chunk_metadata: Metadata for the chunk that will NOT be used in the context during inference.
-        The `chunk_metadata` is required backend functionality.
-    """
-
-    content: InterleavedContent
-    chunk_id: str
-    metadata: dict[str, Any] = Field(default_factory=dict)
-    embedding: list[float] | None = None
-    chunk_metadata: ChunkMetadata | None = None
-
-    @property
-    def document_id(self) -> str | None:
-        """Returns the document_id from either metadata or chunk_metadata, with metadata taking precedence."""
-        # Check metadata first (takes precedence)
-        doc_id = self.metadata.get("document_id")
-        if doc_id is not None:
-            if not isinstance(doc_id, str):
-                raise TypeError(f"metadata['document_id'] must be a string, got {type(doc_id).__name__}: {doc_id!r}")
-            return doc_id
-
-        # Fall back to chunk_metadata if available (Pydantic ensures type safety)
-        if self.chunk_metadata is not None:
-            return self.chunk_metadata.document_id
-
-        return None
-
-
-@json_schema_type
-class QueryChunksResponse(BaseModel):
-    """Response from querying chunks in a vector database.
-
-    Return type of `VectorIO.query_chunks`. The model declares no validator tying
-    the two lists together, so it does not itself enforce that they have equal length.
-
-    :param chunks: List of content chunks returned from the query
-    :param scores: Relevance scores corresponding to each returned chunk
-    """
-
-    chunks: list[Chunk]
-    scores: list[float]
-
-
-@json_schema_type
-class VectorStoreFileCounts(BaseModel):
-    """File processing status counts for a vector store.
-
-    All five counters are required (none has a default). Used by both
-    `VectorStoreObject.file_counts` and `VectorStoreFileBatchObject.file_counts`.
-
-    :param completed: Number of files that have been successfully processed
-    :param cancelled: Number of files that had their processing cancelled
-    :param failed: Number of files that failed to process
-    :param in_progress: Number of files currently being processed
-    :param total: Total number of files in the vector store
-    """
-
-    completed: int
-    cancelled: int
-    failed: int
-    in_progress: int
-    total: int
-
-
-# TODO: rename this as OpenAIVectorStore
-@json_schema_type
-class VectorStoreObject(BaseModel):
-    """OpenAI Vector Store object.
-
-    Required fields are `id`, `created_at` and `file_counts`; the rest have defaults.
-
-    :param id: Unique identifier for the vector store
-    :param object: Object type identifier, always "vector_store"
-    :param created_at: Timestamp when the vector store was created
-    :param name: (Optional) Name of the vector store
-    :param usage_bytes: Storage space used by the vector store in bytes, defaults to 0
-    :param file_counts: File processing status counts for the vector store
-    :param status: Current status of the vector store, defaults to "completed"
-    :param expires_after: (Optional) Expiration policy for the vector store
-    :param expires_at: (Optional) Timestamp when the vector store will expire
-    :param last_active_at: (Optional) Timestamp of last activity on the vector store
-    :param metadata: Set of key-value pairs that can be attached to the vector store, defaults to an empty dict
-    """
-
-    id: str
-    object: str = "vector_store"
-    created_at: int
-    name: str | None = None
-    usage_bytes: int = 0
-    file_counts: VectorStoreFileCounts
-    status: str = "completed"
-    expires_after: dict[str, Any] | None = None
-    expires_at: int | None = None
-    last_active_at: int | None = None
-    metadata: dict[str, Any] = Field(default_factory=dict)
-
-
-@json_schema_type
-class VectorStoreCreateRequest(BaseModel):
-    """Request to create a vector store.
-
-    No field is required: `file_ids` and `metadata` default to empty containers,
-    the others to None.
-
-    :param name: (Optional) Name for the vector store
-    :param file_ids: List of file IDs to include in the vector store, defaults to an empty list
-    :param expires_after: (Optional) Expiration policy for the vector store
-    :param chunking_strategy: (Optional) Strategy for splitting files into chunks
-    :param metadata: Set of key-value pairs that can be attached to the vector store, defaults to an empty dict
-    """
-
-    name: str | None = None
-    file_ids: list[str] = Field(default_factory=list)
-    expires_after: dict[str, Any] | None = None
-    chunking_strategy: dict[str, Any] | None = None
-    metadata: dict[str, Any] = Field(default_factory=dict)
-
-
-@json_schema_type
-class VectorStoreModifyRequest(BaseModel):
-    """Request to modify a vector store.
-
-    Every field defaults to None. The fields match the optional parameters of
-    `VectorIO.openai_update_vector_store`.
-
-    :param name: (Optional) Updated name for the vector store
-    :param expires_after: (Optional) Updated expiration policy for the vector store
-    :param metadata: (Optional) Updated set of key-value pairs for the vector store
-    """
-
-    name: str | None = None
-    expires_after: dict[str, Any] | None = None
-    metadata: dict[str, Any] | None = None
-
-
-@json_schema_type
-class VectorStoreListResponse(BaseModel):
-    """Response from listing vector stores.
-
-    Return type of `VectorIO.openai_list_vector_stores`. Only `data` is required.
-
-    :param object: Object type identifier, always "list"
-    :param data: List of vector store objects
-    :param first_id: (Optional) ID of the first vector store in the list for pagination
-    :param last_id: (Optional) ID of the last vector store in the list for pagination
-    :param has_more: Whether there are more vector stores available beyond this page, defaults to False
-    """
-
-    object: str = "list"
-    data: list[VectorStoreObject]
-    first_id: str | None = None
-    last_id: str | None = None
-    has_more: bool = False
-
-
-@json_schema_type
-class VectorStoreSearchRequest(BaseModel):
-    """Request to search a vector store.
-
-    Only `query` is required.
-
-    :param query: Search query as a string or list of strings
-    :param filters: (Optional) Filters based on file attributes to narrow search results
-    :param max_num_results: Maximum number of results to return, defaults to 10
-    :param ranking_options: (Optional) Options for ranking and filtering search results
-    :param rewrite_query: Whether to rewrite the query for better vector search performance, defaults to False
-    """
-
-    query: str | list[str]
-    filters: dict[str, Any] | None = None
-    max_num_results: int = 10
-    ranking_options: dict[str, Any] | None = None
-    rewrite_query: bool = False
-
-
-@json_schema_type
-class VectorStoreContent(BaseModel):
-    """Content item from a vector store file or search result.
-
-    Both fields are required; `type` has no default even though "text" is its only allowed value.
-
-    :param type: Content type, currently only "text" is supported
-    :param text: The actual text content
-    """
-
-    type: Literal["text"]
-    text: str
-
-
-@json_schema_type
-class VectorStoreSearchResponse(BaseModel):
-    """A single search result from a vector store.
-
-    Instances are the elements of `VectorStoreSearchResponsePage.data`.
-
-    :param file_id: Unique identifier of the file containing the result
-    :param filename: Name of the file containing the result
-    :param score: Relevance score for this search result
-    :param attributes: (Optional) Key-value attributes associated with the file; values are str, float or bool
-    :param content: List of content items matching the search query
-    """
-
-    file_id: str
-    filename: str
-    score: float
-    attributes: dict[str, str | float | bool] | None = None
-    content: list[VectorStoreContent]
-
-
-@json_schema_type
-class VectorStoreSearchResponsePage(BaseModel):
-    """Paginated response from searching a vector store.
-
-    Return type of `VectorIO.openai_search_vector_store`.
-
-    :param object: Object type identifier for the search results page, always "vector_store.search_results.page"
-    :param search_query: The original search query that was executed
-    :param data: List of search result objects
-    :param has_more: Whether there are more results available beyond this page, defaults to False
-    :param next_page: (Optional) Token for retrieving the next page of results
-    """
-
-    object: str = "vector_store.search_results.page"
-    # NOTE(review): typed as a single str, whereas the search endpoint accepts
-    # `str | list[str]` for its query - confirm how list queries are represented here.
-    search_query: str
-    data: list[VectorStoreSearchResponse]
-    has_more: bool = False
-    next_page: str | None = None
-
-
-@json_schema_type
-class VectorStoreDeleteResponse(BaseModel):
-    """Response from deleting a vector store.
-
-    Return type of `VectorIO.openai_delete_vector_store`. Only `id` is required.
-
-    :param id: Unique identifier of the deleted vector store
-    :param object: Object type identifier for the deletion response, always "vector_store.deleted"
-    :param deleted: Whether the deletion operation was successful, defaults to True
-    """
-
-    id: str
-    object: str = "vector_store.deleted"
-    deleted: bool = True
-
-
-@json_schema_type
-class VectorStoreChunkingStrategyAuto(BaseModel):
-    """Automatic chunking strategy for vector store files.
-
-    Carries no configuration; `type` is the discriminator value used by the
-    `VectorStoreChunkingStrategy` union.
-
-    :param type: Strategy type, always "auto" for automatic chunking
-    """
-
-    type: Literal["auto"] = "auto"
-
-
-@json_schema_type
-class VectorStoreChunkingStrategyStaticConfig(BaseModel):
-    """Configuration for static chunking strategy.
-
-    :param chunk_overlap_tokens: Number of tokens to overlap between adjacent chunks, defaults to 400
-    :param max_chunk_size_tokens: Maximum number of tokens per chunk, must be between 100 and 4096
-        (inclusive), defaults to 800
-    """
-
-    # No bound is declared on the overlap; only max_chunk_size_tokens is range-checked.
-    chunk_overlap_tokens: int = 400
-    max_chunk_size_tokens: int = Field(800, ge=100, le=4096)
-
-
-@json_schema_type
-class VectorStoreChunkingStrategyStatic(BaseModel):
-    """Static chunking strategy with configurable parameters.
-
-    `static` is required; `type` is the discriminator value used by the
-    `VectorStoreChunkingStrategy` union.
-
-    :param type: Strategy type, always "static" for static chunking
-    :param static: Configuration parameters for the static chunking strategy
-    """
-
-    type: Literal["static"] = "static"
-    static: VectorStoreChunkingStrategyStaticConfig
-
-
-# Union of the auto and static chunking strategies, discriminated on their `type`
-# field ("auto" or "static"), then passed to register_schema under the same name.
-VectorStoreChunkingStrategy = Annotated[
-    VectorStoreChunkingStrategyAuto | VectorStoreChunkingStrategyStatic,
-    Field(discriminator="type"),
-]
-register_schema(VectorStoreChunkingStrategy, name="VectorStoreChunkingStrategy")
-
-
-class SearchRankingOptions(BaseModel):
-    """Options for ranking and filtering search results.
-
-    Type of the `ranking_options` parameter of `VectorIO.openai_search_vector_store`.
-
-    :param ranker: (Optional) Name of the ranking algorithm to use
-    :param score_threshold: (Optional) Minimum relevance score threshold for results, defaults to 0.0
-    """
-
-    ranker: str | None = None
-    # NOTE: OpenAI File Search Tool requires threshold to be between 0 and 1, however
-    # we don't guarantee that the score is between 0 and 1, so will leave this unconstrained
-    # and let the provider handle it
-    score_threshold: float | None = Field(default=0.0)
-
-
-@json_schema_type
-class VectorStoreFileLastError(BaseModel):
-    """Error information for failed vector store file processing.
-
-    Both fields are required.
-
-    :param code: Error code indicating the type of failure, either "server_error" or "rate_limit_exceeded"
-    :param message: Human-readable error message describing the failure
-    """
-
-    code: Literal["server_error"] | Literal["rate_limit_exceeded"]
-    message: str
-
-
-# Closed set of processing states; used as the type of `status` on both
-# VectorStoreFileObject and VectorStoreFileBatchObject.
-VectorStoreFileStatus = Literal["completed"] | Literal["in_progress"] | Literal["cancelled"] | Literal["failed"]
-register_schema(VectorStoreFileStatus, name="VectorStoreFileStatus")
-
-
-@json_schema_type
-class VectorStoreFileObject(BaseModel):
-    """OpenAI Vector Store File object.
-
-    Required fields are `id`, `chunking_strategy`, `created_at`, `status` and
-    `vector_store_id`; the rest have defaults.
-
-    :param id: Unique identifier for the file
-    :param object: Object type identifier, always "vector_store.file"
-    :param attributes: Key-value attributes associated with the file, defaults to an empty dict
-    :param chunking_strategy: Strategy used for splitting the file into chunks
-    :param created_at: Timestamp when the file was added to the vector store
-    :param last_error: (Optional) Error information if file processing failed
-    :param status: Current processing status of the file
-    :param usage_bytes: Storage space used by this file in bytes, defaults to 0
-    :param vector_store_id: ID of the vector store containing this file
-    """
-
-    id: str
-    object: str = "vector_store.file"
-    attributes: dict[str, Any] = Field(default_factory=dict)
-    chunking_strategy: VectorStoreChunkingStrategy
-    created_at: int
-    last_error: VectorStoreFileLastError | None = None
-    status: VectorStoreFileStatus
-    usage_bytes: int = 0
-    vector_store_id: str
-
-
-@json_schema_type
-class VectorStoreListFilesResponse(BaseModel):
-    """Response from listing files in a vector store.
-
-    Only `data` is required. Declares the same fields as `VectorStoreFilesListInBatchResponse`.
-
-    :param object: Object type identifier, always "list"
-    :param data: List of vector store file objects
-    :param first_id: (Optional) ID of the first file in the list for pagination
-    :param last_id: (Optional) ID of the last file in the list for pagination
-    :param has_more: Whether there are more files available beyond this page, defaults to False
-    """
-
-    object: str = "list"
-    data: list[VectorStoreFileObject]
-    first_id: str | None = None
-    last_id: str | None = None
-    has_more: bool = False
-
-
-@json_schema_type
-class VectorStoreFileContentsResponse(BaseModel):
-    """Response from retrieving the contents of a vector store file.
-
-    All four fields are required; unlike `VectorStoreFileObject.attributes`,
-    `attributes` has no default here.
-
-    :param file_id: Unique identifier for the file
-    :param filename: Name of the file
-    :param attributes: Key-value attributes associated with the file
-    :param content: List of content items from the file
-    """
-
-    file_id: str
-    filename: str
-    attributes: dict[str, Any]
-    content: list[VectorStoreContent]
-
-
-@json_schema_type
-class VectorStoreFileDeleteResponse(BaseModel):
-    """Response from deleting a vector store file.
-
-    Only `id` is required.
-
-    :param id: Unique identifier of the deleted file
-    :param object: Object type identifier for the deletion response, always "vector_store.file.deleted"
-    :param deleted: Whether the deletion operation was successful, defaults to True
-    """
-
-    id: str
-    object: str = "vector_store.file.deleted"
-    deleted: bool = True
-
-
-@json_schema_type
-class VectorStoreFileBatchObject(BaseModel):
-    """OpenAI Vector Store File Batch object.
-
-    Every field except `object` is required.
-
-    :param id: Unique identifier for the file batch
-    :param object: Object type identifier, always "vector_store.file_batch"
-    :param created_at: Timestamp when the file batch was created
-    :param vector_store_id: ID of the vector store containing the file batch
-    :param status: Current processing status of the file batch
-    :param file_counts: File processing status counts for the batch
-    """
-
-    id: str
-    object: str = "vector_store.file_batch"
-    created_at: int
-    vector_store_id: str
-    status: VectorStoreFileStatus
-    file_counts: VectorStoreFileCounts
-
-
-@json_schema_type
-class VectorStoreFilesListInBatchResponse(BaseModel):
-    """Response from listing files in a vector store file batch.
-
-    Only `data` is required. Declares the same fields as `VectorStoreListFilesResponse`.
-
-    :param object: Object type identifier, always "list"
-    :param data: List of vector store file objects in the batch
-    :param first_id: (Optional) ID of the first file in the list for pagination
-    :param last_id: (Optional) ID of the last file in the list for pagination
-    :param has_more: Whether there are more files available beyond this page, defaults to False
-    """
-
-    object: str = "list"
-    data: list[VectorStoreFileObject]
-    first_id: str | None = None
-    last_id: str | None = None
-    has_more: bool = False
-
-
-# extra_body can be accessed via .model_extra
-@json_schema_type
-class OpenAICreateVectorStoreRequestWithExtraBody(BaseModel, extra="allow"):
-    """Request to create a vector store with extra_body support.
-
-    Declared with `extra="allow"`, so keys beyond the fields below are accepted.
-    Every declared field defaults to None. Body type of `VectorIO.openai_create_vector_store`.
-
-    :param name: (Optional) A name for the vector store
-    :param file_ids: (Optional) List of file IDs to include in the vector store
-    :param expires_after: (Optional) Expiration policy for the vector store
-    :param chunking_strategy: (Optional) Strategy for splitting files into chunks
-    :param metadata: (Optional) Set of key-value pairs that can be attached to the vector store
-    """
-
-    name: str | None = None
-    file_ids: list[str] | None = None
-    expires_after: dict[str, Any] | None = None
-    chunking_strategy: dict[str, Any] | None = None
-    metadata: dict[str, Any] | None = None
-
-
-# extra_body can be accessed via .model_extra
-@json_schema_type
-class OpenAICreateVectorStoreFileBatchRequestWithExtraBody(BaseModel, extra="allow"):
-    """Request to create a vector store file batch with extra_body support.
-
-    Declared with `extra="allow"`, so keys beyond the fields below are accepted.
-    Only `file_ids` is required.
-
-    :param file_ids: A list of File IDs that the vector store should use
-    :param attributes: (Optional) Key-value attributes to store with the files
-    :param chunking_strategy: (Optional) The chunking strategy used to chunk the file(s). Defaults to auto
-        (NOTE(review): the field default is None, so "auto" is presumably applied by the implementation - confirm)
-    """
-
-    file_ids: list[str]
-    attributes: dict[str, Any] | None = None
-    chunking_strategy: VectorStoreChunkingStrategy | None = None
-
-
-class VectorStoreTable(Protocol):
-    """Lookup interface that resolves a vector store ID to a `VectorStore`, or None."""
-
-    def get_vector_store(self, vector_store_id: str) -> VectorStore | None: ...
-
-
-@runtime_checkable
-@trace_protocol
-class VectorIO(Protocol):
-    vector_store_table: VectorStoreTable | None = None
-
-    # this will just block now until chunks are inserted, but it should
-    # probably return a Job instance which can be polled for completion
-    # NOTE(review): this TODO used to read "rename vector_store_id to vector_store_id";
-    # the parameter already has that name, so the TODO looks stale - confirm and remove.
-    @webmethod(route="/vector-io/insert", method="POST", level=LLAMA_STACK_API_V1)
-    async def insert_chunks(
-        self,
-        vector_store_id: str,
-        chunks: list[Chunk],
-        ttl_seconds: int | None = None,
-    ) -> None:
-        """Insert chunks into a vector store.
-
-        :param vector_store_id: The identifier of the vector store to insert the chunks into.
-        :param chunks: The chunks to insert. Each `Chunk` should contain content which can be interleaved text, images, or other types.
-            `metadata`: `dict[str, Any]` and `embedding`: `List[float]` are optional.
-            If `metadata` is provided, you configure how Llama Stack formats the chunk during generation.
-            If `embedding` is not provided, it will be computed later.
-        :param ttl_seconds: (Optional) The time to live of the chunks.
-        """
-        ...
-
-    # NOTE(review): this TODO used to read "rename vector_store_id to vector_store_id";
-    # the parameter already has that name, so the TODO looks stale - confirm and remove.
-    @webmethod(route="/vector-io/query", method="POST", level=LLAMA_STACK_API_V1)
-    async def query_chunks(
-        self,
-        vector_store_id: str,
-        query: InterleavedContent,
-        params: dict[str, Any] | None = None,
-    ) -> QueryChunksResponse:
-        """Query chunks from a vector store.
-
-        :param vector_store_id: The identifier of the vector store to query.
-        :param query: The query to search for.
-        :param params: (Optional) The parameters of the query.
-        :returns: A QueryChunksResponse.
-        """
-        ...
-
-    # OpenAI Vector Stores API endpoints
-    @webmethod(route="/vector_stores", method="POST", level=LLAMA_STACK_API_V1)
-    async def openai_create_vector_store(
-        self,
-        params: Annotated[OpenAICreateVectorStoreRequestWithExtraBody, Body(...)],
-    ) -> VectorStoreObject:
-        """Creates a vector store.
-
-        Generate an OpenAI-compatible vector store with the given parameters.
-        :returns: A VectorStoreObject representing the created vector store.
-        """
-        ...
-
-    @webmethod(route="/vector_stores", method="GET", level=LLAMA_STACK_API_V1)
-    async def openai_list_vector_stores(
-        self,
-        limit: int | None = 20,
-        order: str | None = "desc",
-        after: str | None = None,
-        before: str | None = None,
-    ) -> VectorStoreListResponse:
-        """Returns a list of vector stores.
-
-        :param limit: A limit on the number of objects to be returned. Limit can range between 1 and 100, and the default is 20.
-        :param order: Sort order by the `created_at` timestamp of the objects. `asc` for ascending order and `desc` for descending order.
-        :param after: A cursor for use in pagination. `after` is an object ID that defines your place in the list.
-        :param before: A cursor for use in pagination. `before` is an object ID that defines your place in the list.
-        :returns: A VectorStoreListResponse containing the list of vector stores.
-        """
-        ...
-
-    @webmethod(route="/vector_stores/{vector_store_id}", method="GET", level=LLAMA_STACK_API_V1)
-    async def openai_retrieve_vector_store(
-        self,
-        vector_store_id: str,
-    ) -> VectorStoreObject:
-        """Retrieves a vector store.
-
-        :param vector_store_id: The ID of the vector store to retrieve.
-        :returns: A VectorStoreObject representing the vector store.
-        """
-        ...
-
-    @webmethod(
-        route="/vector_stores/{vector_store_id}",
-        method="POST",
-        level=LLAMA_STACK_API_V1,
-    )
-    async def openai_update_vector_store(
-        self,
-        vector_store_id: str,
-        name: str | None = None,
-        expires_after: dict[str, Any] | None = None,
-        metadata: dict[str, Any] | None = None,
-    ) -> VectorStoreObject:
-        """Updates a vector store.
-
-        :param vector_store_id: The ID of the vector store to update.
-        :param name: The name of the vector store.
-        :param expires_after: The expiration policy for a vector store.
-        :param metadata: Set of 16 key-value pairs that can be attached to an object.
-        :returns: A VectorStoreObject representing the updated vector store.
-        """
-        ...
-
-    @webmethod(
-        route="/vector_stores/{vector_store_id}",
-        method="DELETE",
-        level=LLAMA_STACK_API_V1,
-    )
-    async def openai_delete_vector_store(
-        self,
-        vector_store_id: str,
-    ) -> VectorStoreDeleteResponse:
-        """Delete a vector store.
-
-        :param vector_store_id: The ID of the vector store to delete.
-        :returns: A VectorStoreDeleteResponse indicating the deletion status.
-        """
-        ...
-
-    @webmethod(
-        route="/vector_stores/{vector_store_id}/search",
-        method="POST",
-        level=LLAMA_STACK_API_V1,
-    )
-    async def openai_search_vector_store(
-        self,
-        vector_store_id: str,
-        query: str | list[str],
-        filters: dict[str, Any] | None = None,
-        max_num_results: int | None = 10,
-        ranking_options: SearchRankingOptions | None = None,
-        rewrite_query: bool | None = False,
-        search_mode: (
-            str | None
-        ) = "vector",  # Using str instead of Literal due to OpenAPI schema generator limitations
-    ) -> VectorStoreSearchResponsePage:
-        """Search for chunks in a vector store.
-
-        Searches a vector store for relevant chunks based on a query and optional file attribute filters.
-
-        :param vector_store_id: The ID of the vector store to search.
-        :param query: The query string or array for performing the search.
-        :param filters: Filters based on file attributes to narrow the search results.
-        :param max_num_results: Maximum number of results to return (1 to 50 inclusive, default 10).
-        :param ranking_options: Ranking options for fine-tuning the search results.
-        :param rewrite_query: Whether to rewrite the natural language query for vector search (default false)
-        :param search_mode: The search mode to use - "keyword", "vector", or "hybrid" (default "vector")
-        :returns: A VectorStoreSearchResponse containing the search results.
-        """
-        ...
-
-    @webmethod(
-        route="/vector_stores/{vector_store_id}/files",
-        method="POST",
-        level=LLAMA_STACK_API_V1,
-    )
-    async def openai_attach_file_to_vector_store(
-        self,
-        vector_store_id: str,
-        file_id: str,
-        attributes: dict[str, Any] | None = None,
-        chunking_strategy: VectorStoreChunkingStrategy | None = None,
-    ) -> VectorStoreFileObject:
-        """Attach a file to a vector store.
-
-        :param vector_store_id: The ID of the vector store to attach the file to.
-        :param file_id: The ID of the file to attach to the vector store.
-        :param attributes: The key-value attributes stored with the file, which can be used for filtering.
-        :param chunking_strategy: The chunking strategy to use for the file.
-        :returns: A VectorStoreFileObject representing the attached file.
-        """
-        ...
-
-    @webmethod(
-        route="/vector_stores/{vector_store_id}/files",
-        method="GET",
-        level=LLAMA_STACK_API_V1,
-    )
-    async def openai_list_files_in_vector_store(
-        self,
-        vector_store_id: str,
-        limit: int | None = 20,
-        order: str | None = "desc",
-        after: str | None = None,
-        before: str | None = None,
-        filter: VectorStoreFileStatus | None = None,
-    ) -> VectorStoreListFilesResponse:
-        """List files in a vector store.
-
-        :param vector_store_id: The ID of the vector store to list files from.
-        :param limit: (Optional) A limit on the number of objects to be returned. Limit can range between 1 and 100, and the default is 20.
-        :param order: (Optional) Sort order by the `created_at` timestamp of the objects. `asc` for ascending order and `desc` for descending order.
-        :param after: (Optional) A cursor for use in pagination. `after` is an object ID that defines your place in the list.
-        :param before: (Optional) A cursor for use in pagination. `before` is an object ID that defines your place in the list.
-        :param filter: (Optional) Filter by file status to only return files with the specified status.
-        :returns: A VectorStoreListFilesResponse containing the list of files.
-        """
-        ...
-
-    @webmethod(
-        route="/vector_stores/{vector_store_id}/files/{file_id}",
-        method="GET",
-        level=LLAMA_STACK_API_V1,
-    )
-    async def openai_retrieve_vector_store_file(
-        self,
-        vector_store_id: str,
-        file_id: str,
-    ) -> VectorStoreFileObject:
-        """Retrieves a vector store file.
-
-        :param vector_store_id: The ID of the vector store containing the file to retrieve.
-        :param file_id: The ID of the file to retrieve.
-        :returns: A VectorStoreFileObject representing the file.
-        """
-        ...
-
-    @webmethod(
-        route="/vector_stores/{vector_store_id}/files/{file_id}/content",
-        method="GET",
-        level=LLAMA_STACK_API_V1,
-    )
-    async def openai_retrieve_vector_store_file_contents(
-        self,
-        vector_store_id: str,
-        file_id: str,
-    ) -> VectorStoreFileContentsResponse:
-        """Retrieves the contents of a vector store file.
-
-        :param vector_store_id: The ID of the vector store containing the file to retrieve.
-        :param file_id: The ID of the file to retrieve.
-        :returns: A list of InterleavedContent representing the file contents.
-        """
-        ...
-
-    @webmethod(
-        route="/vector_stores/{vector_store_id}/files/{file_id}",
-        method="POST",
-        level=LLAMA_STACK_API_V1,
-    )
-    async def openai_update_vector_store_file(
-        self,
-        vector_store_id: str,
-        file_id: str,
-        attributes: dict[str, Any],
-    ) -> VectorStoreFileObject:
-        """Updates a vector store file.
-
-        :param vector_store_id: The ID of the vector store containing the file to update.
-        :param file_id: The ID of the file to update.
-        :param attributes: The updated key-value attributes to store with the file.
-        :returns: A VectorStoreFileObject representing the updated file.
-        """
-        ...
-
-    @webmethod(
-        route="/vector_stores/{vector_store_id}/files/{file_id}",
-        method="DELETE",
-        level=LLAMA_STACK_API_V1,
-    )
-    async def openai_delete_vector_store_file(
-        self,
-        vector_store_id: str,
-        file_id: str,
-    ) -> VectorStoreFileDeleteResponse:
-        """Delete a vector store file.
-
-        :param vector_store_id: The ID of the vector store containing the file to delete.
-        :param file_id: The ID of the file to delete.
-        :returns: A VectorStoreFileDeleteResponse indicating the deletion status.
-        """
-        ...
-
-    @webmethod(
-        route="/vector_stores/{vector_store_id}/file_batches",
-        method="POST",
-        level=LLAMA_STACK_API_V1,
-    )
-    async def openai_create_vector_store_file_batch(
-        self,
-        vector_store_id: str,
-        params: Annotated[OpenAICreateVectorStoreFileBatchRequestWithExtraBody, Body(...)],
-    ) -> VectorStoreFileBatchObject:
-        """Create a vector store file batch.
-
-        Generate an OpenAI-compatible vector store file batch for the given vector store.
-        :param vector_store_id: The ID of the vector store to create the file batch for.
-        :returns: A VectorStoreFileBatchObject representing the created file batch.
-        """
-        ...
-
-    @webmethod(
-        route="/vector_stores/{vector_store_id}/file_batches/{batch_id}",
-        method="GET",
-        level=LLAMA_STACK_API_V1,
-    )
-    async def openai_retrieve_vector_store_file_batch(
-        self,
-        batch_id: str,
-        vector_store_id: str,
-    ) -> VectorStoreFileBatchObject:
-        """Retrieve a vector store file batch.
-
-        :param batch_id: The ID of the file batch to retrieve.
-        :param vector_store_id: The ID of the vector store containing the file batch.
-        :returns: A VectorStoreFileBatchObject representing the file batch.
-        """
-        ...
-
-    @webmethod(
-        route="/vector_stores/{vector_store_id}/file_batches/{batch_id}/files",
-        method="GET",
-        level=LLAMA_STACK_API_V1,
-    )
-    async def openai_list_files_in_vector_store_file_batch(
-        self,
-        batch_id: str,
-        vector_store_id: str,
-        after: str | None = None,
-        before: str | None = None,
-        filter: str | None = None,
-        limit: int | None = 20,
-        order: str | None = "desc",
-    ) -> VectorStoreFilesListInBatchResponse:
-        """Returns a list of vector store files in a batch.
-
-        :param batch_id: The ID of the file batch to list files from.
-        :param vector_store_id: The ID of the vector store containing the file batch.
-        :param after: A cursor for use in pagination. `after` is an object ID that defines your place in the list.
-        :param before: A cursor for use in pagination. `before` is an object ID that defines your place in the list.
-        :param filter: Filter by file status. One of in_progress, completed, failed, cancelled.
-        :param limit: A limit on the number of objects to be returned. Limit can range between 1 and 100, and the default is 20.
-        :param order: Sort order by the `created_at` timestamp of the objects. `asc` for ascending order and `desc` for descending order.
-        :returns: A VectorStoreFilesListInBatchResponse containing the list of files in the batch.
-        """
-        ...
-
-    @webmethod(
-        route="/vector_stores/{vector_store_id}/file_batches/{batch_id}/cancel",
-        method="POST",
-        level=LLAMA_STACK_API_V1,
-    )
-    async def openai_cancel_vector_store_file_batch(
-        self,
-        batch_id: str,
-        vector_store_id: str,
-    ) -> VectorStoreFileBatchObject:
-        """Cancels a vector store file batch.
-
-        :param batch_id: The ID of the file batch to cancel.
-        :param vector_store_id: The ID of the vector store containing the file batch.
-        :returns: A VectorStoreFileBatchObject representing the cancelled file batch.
-        """
-        ...
--- a/src/llama_stack/apis/vector_stores/init.py
+++ b/src/llama_stack/apis/vector_stores/init.py
@ -1,7 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from .vector_stores import *
--- a/src/llama_stack/apis/vector_stores/vector_stores.py
+++ b/src/llama_stack/apis/vector_stores/vector_stores.py
@ -1,51 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from typing import Literal
-
-from pydantic import BaseModel
-
-from llama_stack.apis.resource import Resource, ResourceType
-
-
-# Internal resource type for storing the vector store routing and other information
-class VectorStore(Resource):
-    """Vector database resource for storing and querying vector embeddings.
-
-    :param type: Type of resource, always 'vector_store' for vector stores
-    :param embedding_model: Name of the embedding model to use for vector generation
-    :param embedding_dimension: Dimension of the embedding vectors
-    """
-
-    type: Literal[ResourceType.vector_store] = ResourceType.vector_store
-
-    embedding_model: str
-    embedding_dimension: int
-    vector_store_name: str | None = None
-
-    @property
-    def vector_store_id(self) -> str:
-        return self.identifier
-
-    @property
-    def provider_vector_store_id(self) -> str | None:
-        return self.provider_resource_id
-
-
-class VectorStoreInput(BaseModel):
-    """Input parameters for creating or configuring a vector database.
-
-    :param vector_store_id: Unique identifier for the vector store
-    :param embedding_model: Name of the embedding model to use for vector generation
-    :param embedding_dimension: Dimension of the embedding vectors
-    :param provider_vector_store_id: (Optional) Provider-specific identifier for the vector store
-    """
-
-    vector_store_id: str
-    embedding_model: str
-    embedding_dimension: int
-    provider_id: str | None = None
-    provider_vector_store_id: str | None = None
--- a/src/llama_stack/apis/version.py
+++ b/src/llama_stack/apis/version.py
@ -1,9 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-LLAMA_STACK_API_V1 = "v1"
-LLAMA_STACK_API_V1BETA = "v1beta"
-LLAMA_STACK_API_V1ALPHA = "v1alpha"
--- a/src/llama_stack/cli/stack/_list_deps.py
+++ b/src/llama_stack/cli/stack/_list_deps.py
@ -21,7 +21,7 @@ from llama_stack.core.datatypes import (
 from llama_stack.core.distribution import get_provider_registry
 from llama_stack.core.stack import replace_env_vars
 from llama_stack.log import get_logger
-from llama_stack.providers.datatypes import Api
+from llama_stack_api import Api

 TEMPLATES_PATH = Path(__file__).parent.parent.parent / "templates"

--- a/src/llama_stack/cli/stack/list_deps.py
+++ b/src/llama_stack/cli/stack/list_deps.py
@ -46,6 +46,10 @@ class StackListDeps(Subcommand):
    def _run_stack_list_deps_command(self, args: argparse.Namespace) -> None:
        # always keep implementation completely silo-ed away from CLI so CLI
        # can be fast to load and reduces dependencies
+        if not args.config and not args.providers:
+            self.parser.print_help()
+            self.parser.exit()
+
        from ._list_deps import run_stack_list_deps_command

        return run_stack_list_deps_command(args)
--- a/src/llama_stack/cli/stack/list_stacks.py
+++ b/src/llama_stack/cli/stack/list_stacks.py
@ -9,48 +9,69 @@ from pathlib import Path

 from llama_stack.cli.subcommand import Subcommand
 from llama_stack.cli.table import print_table
+from llama_stack.core.utils.config_dirs import DISTRIBS_BASE_DIR


 class StackListBuilds(Subcommand):
-    """List built stacks in .llama/distributions directory"""
+    """List available distributions (both built-in and custom)"""

    def __init__(self, subparsers: argparse._SubParsersAction):
        super().__init__()
        self.parser = subparsers.add_parser(
            "list",
            prog="llama stack list",
-            description="list the build stacks",
+            description="list available distributions",
            formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        )
        self._add_arguments()
        self.parser.set_defaults(func=self._list_stack_command)

-    def _get_distribution_dirs(self) -> dict[str, Path]:
-        """Return a dictionary of distribution names and their paths"""
-        distributions = {}
-        dist_dir = Path.home() / ".llama" / "distributions"
+    def _get_distribution_dirs(self) -> dict[str, tuple[Path, str]]:
+        """Return a dictionary of distribution names and their paths with source type
+
+        Returns:
+            dict mapping distro name to (path, source_type) where source_type is 'built-in' or 'custom'
+        """
+        distributions = {}
+
+        # Get built-in distributions from source code
+        distro_dir = Path(__file__).parent.parent.parent / "distributions"
+        if distro_dir.exists():
+            for stack_dir in distro_dir.iterdir():
+                if stack_dir.is_dir() and not stack_dir.name.startswith(".") and not stack_dir.name.startswith("__"):
+                    distributions[stack_dir.name] = (stack_dir, "built-in")
+
+        # Get custom/run distributions from ~/.llama/distributions
+        # These override built-in ones if they have the same name
+        if DISTRIBS_BASE_DIR.exists():
+            for stack_dir in DISTRIBS_BASE_DIR.iterdir():
+                if stack_dir.is_dir() and not stack_dir.name.startswith("."):
+                    # Clean up the name (remove llamastack- prefix if present)
+                    name = stack_dir.name.replace("llamastack-", "")
+                    distributions[name] = (stack_dir, "custom")

-        if dist_dir.exists():
-            for stack_dir in dist_dir.iterdir():
-                if stack_dir.is_dir():
-                    distributions[stack_dir.name] = stack_dir
        return distributions

    def _list_stack_command(self, args: argparse.Namespace) -> None:
        distributions = self._get_distribution_dirs()

        if not distributions:
-            print("No stacks found in ~/.llama/distributions")
+            print("No distributions found")
            return

-        headers = ["Stack Name", "Path"]
-        headers.extend(["Build Config", "Run Config"])
+        headers = ["Stack Name", "Source", "Path", "Build Config", "Run Config"]
        rows = []
-        for name, path in distributions.items():
-            row = [name, str(path)]
+        for name, (path, source_type) in sorted(distributions.items()):
+            row = [name, source_type, str(path)]
            # Check for build and run config files
-            build_config = "Yes" if (path / f"{name}-build.yaml").exists() else "No"
-            run_config = "Yes" if (path / f"{name}-run.yaml").exists() else "No"
+            # For built-in distributions, configs are named build.yaml and run.yaml
+            # For custom distributions, configs are named {name}-build.yaml and {name}-run.yaml
+            if source_type == "built-in":
+                build_config = "Yes" if (path / "build.yaml").exists() else "No"
+                run_config = "Yes" if (path / "run.yaml").exists() else "No"
+            else:
+                build_config = "Yes" if (path / f"{name}-build.yaml").exists() else "No"
+                run_config = "Yes" if (path / f"{name}-run.yaml").exists() else "No"
            row.extend([build_config, run_config])
            rows.append(row)
        print_table(rows, headers, separate_rows=True)
--- a/src/llama_stack/cli/stack/run.py
+++ b/src/llama_stack/cli/stack/run.py
@ -393,7 +393,7 @@ class StackRun(Subcommand):
            )
            return

-        ui_dir = REPO_ROOT / "llama_stack" / "ui"
+        ui_dir = REPO_ROOT / "llama_stack_ui"
        logs_dir = Path("~/.llama/ui/logs").expanduser()
        try:
            # Create logs directory if it doesn't exist
--- a/src/llama_stack/cli/stack/utils.py
+++ b/src/llama_stack/cli/stack/utils.py
@ -32,7 +32,7 @@ from llama_stack.core.storage.datatypes import (
 from llama_stack.core.utils.config_dirs import DISTRIBS_BASE_DIR, EXTERNAL_PROVIDERS_DIR
 from llama_stack.core.utils.dynamic import instantiate_class_type
 from llama_stack.core.utils.image_types import LlamaStackImageType
-from llama_stack.providers.datatypes import Api
+from llama_stack_api import Api

 TEMPLATES_PATH = Path(__file__).parent.parent.parent / "distributions"

--- a/src/llama_stack/core/build.py
+++ b/src/llama_stack/core/build.py
@ -4,7 +4,6 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.

-import importlib.resources
 import sys

 from pydantic import BaseModel
@ -12,12 +11,9 @@ from termcolor import cprint

 from llama_stack.core.datatypes import BuildConfig
 from llama_stack.core.distribution import get_provider_registry
-from llama_stack.core.external import load_external_apis
-from llama_stack.core.utils.exec import run_command
-from llama_stack.core.utils.image_types import LlamaStackImageType
 from llama_stack.distributions.template import DistributionTemplate
 from llama_stack.log import get_logger
-from llama_stack.providers.datatypes import Api
+from llama_stack_api import Api

 log = get_logger(name=__name__, category="core")

@ -101,64 +97,3 @@ def print_pip_install_help(config: BuildConfig):
    for special_dep in special_deps:
        cprint(f"uv pip install {special_dep}", color="yellow", file=sys.stderr)
    print()
-
-
-def build_image(
-    build_config: BuildConfig,
-    image_name: str,
-    distro_or_config: str,
-    run_config: str | None = None,
-):
-    container_base = build_config.distribution_spec.container_image or "python:3.12-slim"
-
-    normal_deps, special_deps, external_provider_deps = get_provider_dependencies(build_config)
-    normal_deps += SERVER_DEPENDENCIES
-    if build_config.external_apis_dir:
-        external_apis = load_external_apis(build_config)
-        if external_apis:
-            for _, api_spec in external_apis.items():
-                normal_deps.extend(api_spec.pip_packages)
-
-    if build_config.image_type == LlamaStackImageType.CONTAINER.value:
-        script = str(importlib.resources.files("llama_stack") / "core/build_container.sh")
-        args = [
-            script,
-            "--distro-or-config",
-            distro_or_config,
-            "--image-name",
-            image_name,
-            "--container-base",
-            container_base,
-            "--normal-deps",
-            " ".join(normal_deps),
-        ]
-        # When building from a config file (not a template), include the run config path in the
-        # build arguments
-        if run_config is not None:
-            args.extend(["--run-config", run_config])
-    else:
-        script = str(importlib.resources.files("llama_stack") / "core/build_venv.sh")
-        args = [
-            script,
-            "--env-name",
-            str(image_name),
-            "--normal-deps",
-            " ".join(normal_deps),
-        ]
-
-    # Always pass both arguments, even if empty, to maintain consistent positional arguments
-    if special_deps:
-        args.extend(["--optional-deps", "#".join(special_deps)])
-    if external_provider_deps:
-        args.extend(
-            ["--external-provider-deps", "#".join(external_provider_deps)]
-        )  # the script will install external provider module, get its deps, and install those too.
-
-    return_code = run_command(args)
-
-    if return_code != 0:
-        log.error(
-            f"Failed to build target {image_name} with return code {return_code}",
-        )
-
-    return return_code
--- a/src/llama_stack/core/client.py
+++ b/src/llama_stack/core/client.py
@ -15,7 +15,7 @@ import httpx
 from pydantic import BaseModel, parse_obj_as
 from termcolor import cprint

-from llama_stack.providers.datatypes import RemoteProviderConfig
+from llama_stack_api import RemoteProviderConfig

 _CLIENT_CLASSES = {}

--- a/src/llama_stack/core/configure.py
+++ b/src/llama_stack/core/configure.py
@ -20,7 +20,7 @@ from llama_stack.core.stack import cast_image_name_to_string, replace_env_vars
 from llama_stack.core.utils.dynamic import instantiate_class_type
 from llama_stack.core.utils.prompt_for_config import prompt_for_config
 from llama_stack.log import get_logger
-from llama_stack.providers.datatypes import Api, ProviderSpec
+from llama_stack_api import Api, ProviderSpec

 logger = get_logger(name=__name__, category="core")

--- a/src/llama_stack/core/conversations/conversations.py
+++ b/src/llama_stack/core/conversations/conversations.py
@ -10,7 +10,11 @@ from typing import Any, Literal

 from pydantic import BaseModel, TypeAdapter

-from llama_stack.apis.conversations.conversations import (
+from llama_stack.core.datatypes import AccessRule, StackRunConfig
+from llama_stack.core.storage.sqlstore.authorized_sqlstore import AuthorizedSqlStore
+from llama_stack.core.storage.sqlstore.sqlstore import sqlstore_impl
+from llama_stack.log import get_logger
+from llama_stack_api import (
    Conversation,
    ConversationDeletedResource,
    ConversationItem,
@ -20,11 +24,7 @@ from llama_stack.apis.conversations.conversations import (
    Conversations,
    Metadata,
 )
-from llama_stack.core.datatypes import AccessRule, StackRunConfig
-from llama_stack.log import get_logger
-from llama_stack.providers.utils.sqlstore.api import ColumnDefinition, ColumnType
-from llama_stack.providers.utils.sqlstore.authorized_sqlstore import AuthorizedSqlStore
-from llama_stack.providers.utils.sqlstore.sqlstore import sqlstore_impl
+from llama_stack_api.internal.sqlstore import ColumnDefinition, ColumnType

 logger = get_logger(name=__name__, category="openai_conversations")

@ -203,16 +203,11 @@ class ConversationServiceImpl(Conversations):
                "item_data": item_dict,
            }

-            # TODO: Add support for upsert in sql_store, this will fail first if ID exists and then update
-            try:
-                await self.sql_store.insert(table="conversation_items", data=item_record)
-            except Exception:
-                # If insert fails due to ID conflict, update existing record
-                await self.sql_store.update(
-                    table="conversation_items",
-                    data={"created_at": created_at, "item_data": item_dict},
-                    where={"id": item_id},
-                )
+            await self.sql_store.upsert(
+                table="conversation_items",
+                data=item_record,
+                conflict_columns=["id"],
+            )

            created_items.append(item_dict)

--- a/src/llama_stack/core/datatypes.py
+++ b/src/llama_stack/core/datatypes.py
@ -11,20 +11,6 @@ from urllib.parse import urlparse

 from pydantic import BaseModel, Field, field_validator, model_validator

-from llama_stack.apis.benchmarks import Benchmark, BenchmarkInput
-from llama_stack.apis.datasetio import DatasetIO
-from llama_stack.apis.datasets import Dataset, DatasetInput
-from llama_stack.apis.eval import Eval
-from llama_stack.apis.inference import Inference
-from llama_stack.apis.models import Model, ModelInput
-from llama_stack.apis.resource import Resource
-from llama_stack.apis.safety import Safety
-from llama_stack.apis.scoring import Scoring
-from llama_stack.apis.scoring_functions import ScoringFn, ScoringFnInput
-from llama_stack.apis.shields import Shield, ShieldInput
-from llama_stack.apis.tools import ToolGroup, ToolGroupInput, ToolRuntime
-from llama_stack.apis.vector_io import VectorIO
-from llama_stack.apis.vector_stores import VectorStore, VectorStoreInput
 from llama_stack.core.access_control.datatypes import AccessRule
 from llama_stack.core.storage.datatypes import (
    KVStoreReference,
@ -32,7 +18,32 @@ from llama_stack.core.storage.datatypes import (
    StorageConfig,
 )
 from llama_stack.log import LoggingConfig
-from llama_stack.providers.datatypes import Api, ProviderSpec
+from llama_stack_api import (
+    Api,
+    Benchmark,
+    BenchmarkInput,
+    Dataset,
+    DatasetInput,
+    DatasetIO,
+    Eval,
+    Inference,
+    Model,
+    ModelInput,
+    ProviderSpec,
+    Resource,
+    Safety,
+    Scoring,
+    ScoringFn,
+    ScoringFnInput,
+    Shield,
+    ShieldInput,
+    ToolGroup,
+    ToolGroupInput,
+    ToolRuntime,
+    VectorIO,
+    VectorStore,
+    VectorStoreInput,
+)

 LLAMA_STACK_BUILD_CONFIG_VERSION = 2
 LLAMA_STACK_RUN_CONFIG_VERSION = 2
--- a/src/llama_stack/core/distribution.py
+++ b/src/llama_stack/core/distribution.py
@ -15,7 +15,7 @@ from pydantic import BaseModel
 from llama_stack.core.datatypes import BuildConfig, DistributionSpec
 from llama_stack.core.external import load_external_apis
 from llama_stack.log import get_logger
-from llama_stack.providers.datatypes import (
+from llama_stack_api import (
    Api,
    InlineProviderSpec,
    ProviderSpec,
--- a/src/llama_stack/core/external.py
+++ b/src/llama_stack/core/external.py
@ -7,9 +7,9 @@

 import yaml

-from llama_stack.apis.datatypes import Api, ExternalApiSpec
 from llama_stack.core.datatypes import BuildConfig, StackRunConfig
 from llama_stack.log import get_logger
+from llama_stack_api import Api, ExternalApiSpec

 logger = get_logger(name=__name__, category="core")

--- a/src/llama_stack/core/inspect.py
+++ b/src/llama_stack/core/inspect.py
@ -8,18 +8,17 @@ from importlib.metadata import version

 from pydantic import BaseModel

-from llama_stack.apis.inspect import (
+from llama_stack.core.datatypes import StackRunConfig
+from llama_stack.core.external import load_external_apis
+from llama_stack.core.server.routes import get_all_api_routes
+from llama_stack_api import (
    HealthInfo,
+    HealthStatus,
    Inspect,
    ListRoutesResponse,
    RouteInfo,
    VersionInfo,
 )
-from llama_stack.apis.version import LLAMA_STACK_API_V1
-from llama_stack.core.datatypes import StackRunConfig
-from llama_stack.core.external import load_external_apis
-from llama_stack.core.server.routes import get_all_api_routes
-from llama_stack.providers.datatypes import HealthStatus


 class DistributionInspectConfig(BaseModel):
@ -46,8 +45,8 @@ class DistributionInspectImpl(Inspect):
        # Helper function to determine if a route should be included based on api_filter
        def should_include_route(webmethod) -> bool:
            if api_filter is None:
-                # Default: only non-deprecated v1 APIs
-                return not webmethod.deprecated and webmethod.level == LLAMA_STACK_API_V1
+                # Default: only non-deprecated APIs
+                return not webmethod.deprecated
            elif api_filter == "deprecated":
                # Special filter: show deprecated routes regardless of their actual level
                return bool(webmethod.deprecated)
--- a/src/llama_stack/core/library_client.py
+++ b/src/llama_stack/core/library_client.py
@ -18,14 +18,23 @@ from typing import Any, TypeVar, Union, get_args, get_origin
 import httpx
 import yaml
 from fastapi import Response as FastAPIResponse
-from llama_stack_client import (
-    NOT_GIVEN,
-    APIResponse,
-    AsyncAPIResponse,
-    AsyncLlamaStackClient,
-    AsyncStream,
-    LlamaStackClient,
-)
+
+from llama_stack.core.utils.type_inspection import is_unwrapped_body_param
+
+try:
+    from llama_stack_client import (
+        NOT_GIVEN,
+        APIResponse,
+        AsyncAPIResponse,
+        AsyncLlamaStackClient,
+        AsyncStream,
+        LlamaStackClient,
+    )
+except ImportError as e:
+    raise ImportError(
+        "llama-stack-client is not installed. Please install it with `uv pip install llama-stack[client]`."
+    ) from e
+
 from pydantic import BaseModel, TypeAdapter
 from rich.console import Console
 from termcolor import cprint
@ -33,24 +42,16 @@ from termcolor import cprint
 from llama_stack.core.build import print_pip_install_help
 from llama_stack.core.configure import parse_and_maybe_upgrade_config
 from llama_stack.core.datatypes import BuildConfig, BuildProvider, DistributionSpec
-from llama_stack.core.request_headers import (
-    PROVIDER_DATA_VAR,
-    request_provider_data_context,
-)
+from llama_stack.core.request_headers import PROVIDER_DATA_VAR, request_provider_data_context
 from llama_stack.core.resolver import ProviderRegistry
 from llama_stack.core.server.routes import RouteImpls, find_matching_route, initialize_route_impls
-from llama_stack.core.stack import (
-    Stack,
-    get_stack_run_config_from_distro,
-    replace_env_vars,
-)
+from llama_stack.core.stack import Stack, get_stack_run_config_from_distro, replace_env_vars
 from llama_stack.core.telemetry import Telemetry
 from llama_stack.core.telemetry.tracing import CURRENT_TRACE_CONTEXT, end_trace, setup_logger, start_trace
 from llama_stack.core.utils.config import redact_sensitive_fields
 from llama_stack.core.utils.context import preserve_contexts_async_generator
 from llama_stack.core.utils.exec import in_notebook
 from llama_stack.log import get_logger, setup_logging
-from llama_stack.strong_typing.inspection import is_unwrapped_body_param

 logger = get_logger(name=__name__, category="core")

@ -382,6 +383,12 @@ class AsyncLlamaStackAsLibraryClient(AsyncLlamaStackClient):
        matched_func, path_params, route_path, webmethod = find_matching_route(options.method, path, self.route_impls)
        body |= path_params

+        # Pass through params that aren't already handled as path params
+        if options.params:
+            extra_query_params = {k: v for k, v in options.params.items() if k not in path_params}
+            if extra_query_params:
+                body["extra_query"] = extra_query_params
+
        body, field_names = self._handle_file_uploads(options, body)

        body = self._convert_body(matched_func, body, exclude_params=set(field_names))
--- a/src/llama_stack/core/prompts/prompts.py
+++ b/src/llama_stack/core/prompts/prompts.py
@ -9,9 +9,9 @@ from typing import Any

 from pydantic import BaseModel

-from llama_stack.apis.prompts import ListPromptsResponse, Prompt, Prompts
 from llama_stack.core.datatypes import StackRunConfig
-from llama_stack.providers.utils.kvstore import KVStore, kvstore_impl
+from llama_stack.core.storage.kvstore import KVStore, kvstore_impl
+from llama_stack_api import ListPromptsResponse, Prompt, Prompts


 class PromptServiceConfig(BaseModel):
--- a/src/llama_stack/core/providers.py
+++ b/src/llama_stack/core/providers.py
@ -9,9 +9,8 @@ from typing import Any

 from pydantic import BaseModel

-from llama_stack.apis.providers import ListProvidersResponse, ProviderInfo, Providers
 from llama_stack.log import get_logger
-from llama_stack.providers.datatypes import HealthResponse, HealthStatus
+from llama_stack_api import HealthResponse, HealthStatus, ListProvidersResponse, ProviderInfo, Providers

 from .datatypes import StackRunConfig
 from .utils.config import redact_sensitive_fields
--- a/src/llama_stack/core/resolver.py
+++ b/src/llama_stack/core/resolver.py
@ -8,29 +8,6 @@ import importlib.metadata
 import inspect
 from typing import Any

-from llama_stack.apis.agents import Agents
-from llama_stack.apis.batches import Batches
-from llama_stack.apis.benchmarks import Benchmarks
-from llama_stack.apis.conversations import Conversations
-from llama_stack.apis.datasetio import DatasetIO
-from llama_stack.apis.datasets import Datasets
-from llama_stack.apis.datatypes import ExternalApiSpec
-from llama_stack.apis.eval import Eval
-from llama_stack.apis.files import Files
-from llama_stack.apis.inference import Inference, InferenceProvider
-from llama_stack.apis.inspect import Inspect
-from llama_stack.apis.models import Models
-from llama_stack.apis.post_training import PostTraining
-from llama_stack.apis.prompts import Prompts
-from llama_stack.apis.providers import Providers as ProvidersAPI
-from llama_stack.apis.safety import Safety
-from llama_stack.apis.scoring import Scoring
-from llama_stack.apis.scoring_functions import ScoringFunctions
-from llama_stack.apis.shields import Shields
-from llama_stack.apis.tools import ToolGroups, ToolRuntime
-from llama_stack.apis.vector_io import VectorIO
-from llama_stack.apis.vector_stores import VectorStore
-from llama_stack.apis.version import LLAMA_STACK_API_V1ALPHA
 from llama_stack.core.client import get_client_impl
 from llama_stack.core.datatypes import (
    AccessRule,
@ -44,17 +21,44 @@ from llama_stack.core.external import load_external_apis
 from llama_stack.core.store import DistributionRegistry
 from llama_stack.core.utils.dynamic import instantiate_class_type
 from llama_stack.log import get_logger
-from llama_stack.providers.datatypes import (
+from llama_stack_api import (
+    LLAMA_STACK_API_V1ALPHA,
+    Agents,
    Api,
+    Batches,
+    Benchmarks,
    BenchmarksProtocolPrivate,
+    Conversations,
+    DatasetIO,
+    Datasets,
    DatasetsProtocolPrivate,
+    Eval,
+    ExternalApiSpec,
+    Files,
+    Inference,
+    InferenceProvider,
+    Inspect,
+    Models,
    ModelsProtocolPrivate,
+    PostTraining,
+    Prompts,
    ProviderSpec,
    RemoteProviderConfig,
    RemoteProviderSpec,
+    Safety,
+    Scoring,
+    ScoringFunctions,
    ScoringFunctionsProtocolPrivate,
+    Shields,
    ShieldsProtocolPrivate,
+    ToolGroups,
    ToolGroupsProtocolPrivate,
+    ToolRuntime,
+    VectorIO,
+    VectorStore,
+)
+from llama_stack_api import (
+    Providers as ProvidersAPI,
 )

 logger = get_logger(name=__name__, category="core")
@ -397,6 +401,18 @@ async def instantiate_provider(
    impl.__provider_spec__ = provider_spec
    impl.__provider_config__ = config

+    # Apply tracing if telemetry is enabled and any base class has __marked_for_tracing__ marker
+    if run_config.telemetry.enabled:
+        traced_classes = [
+            base for base in reversed(impl.__class__.__mro__) if getattr(base, "__marked_for_tracing__", False)
+        ]
+
+        if traced_classes:
+            from llama_stack.core.telemetry.trace_protocol import trace_protocol
+
+            for cls in traced_classes:
+                trace_protocol(cls)
+
    protocols = api_protocol_map_for_compliance_check(run_config)
    additional_protocols = additional_protocols_map()
    # TODO: check compliance for special tool groups
--- a/src/llama_stack/core/routers/__init__.py
+++ b/src/llama_stack/core/routers/__init__.py
@ -12,8 +12,8 @@ from llama_stack.core.datatypes import (
 )
 from llama_stack.core.stack import StackRunConfig
 from llama_stack.core.store import DistributionRegistry
-from llama_stack.providers.datatypes import Api, RoutingTable
 from llama_stack.providers.utils.inference.inference_store import InferenceStore
+from llama_stack_api import Api, RoutingTable


 async def get_routing_table_impl(
@ -45,6 +45,7 @@ async def get_routing_table_impl(
        raise ValueError(f"API {api.value} not found in router map")

    impl = api_to_tables[api.value](impls_by_provider_id, dist_registry, policy)
+
    await impl.initialize()
    return impl

@ -92,5 +93,6 @@ async def get_auto_router_impl(
        api_to_dep_impl["safety_config"] = run_config.safety

    impl = api_to_routers[api.value](routing_table, **api_to_dep_impl)
+
    await impl.initialize()
    return impl
--- a/src/llama_stack/core/routers/datasets.py
+++ b/src/llama_stack/core/routers/datasets.py
@ -6,11 +6,8 @@

 from typing import Any

-from llama_stack.apis.common.responses import PaginatedResponse
-from llama_stack.apis.datasetio import DatasetIO
-from llama_stack.apis.datasets import DatasetPurpose, DataSource
 from llama_stack.log import get_logger
-from llama_stack.providers.datatypes import RoutingTable
+from llama_stack_api import DatasetIO, DatasetPurpose, DataSource, PaginatedResponse, RoutingTable

 logger = get_logger(name=__name__, category="core::routers")

--- a/src/llama_stack/core/routers/eval_scoring.py
+++ b/src/llama_stack/core/routers/eval_scoring.py
@ -6,15 +6,18 @@

 from typing import Any

-from llama_stack.apis.eval import BenchmarkConfig, Eval, EvaluateResponse, Job
-from llama_stack.apis.scoring import (
+from llama_stack.log import get_logger
+from llama_stack_api import (
+    BenchmarkConfig,
+    Eval,
+    EvaluateResponse,
+    Job,
+    RoutingTable,
    ScoreBatchResponse,
    ScoreResponse,
    Scoring,
    ScoringFnParams,
 )
-from llama_stack.log import get_logger
-from llama_stack.providers.datatypes import RoutingTable

 logger = get_logger(name=__name__, category="core::routers")

--- a/src/llama_stack/core/routers/inference.py
+++ b/src/llama_stack/core/routers/inference.py
@ -15,13 +15,25 @@ from openai.types.chat import ChatCompletionToolChoiceOptionParam as OpenAIChatC
 from openai.types.chat import ChatCompletionToolParam as OpenAIChatCompletionToolParam
 from pydantic import TypeAdapter

-from llama_stack.apis.common.errors import ModelNotFoundError, ModelTypeError
-from llama_stack.apis.inference import (
+from llama_stack.core.telemetry.telemetry import MetricEvent
+from llama_stack.core.telemetry.tracing import enqueue_event, get_current_span
+from llama_stack.log import get_logger
+from llama_stack.models.llama.llama3.chat_format import ChatFormat
+from llama_stack.models.llama.llama3.tokenizer import Tokenizer
+from llama_stack.providers.utils.inference.inference_store import InferenceStore
+from llama_stack_api import (
+    HealthResponse,
+    HealthStatus,
    Inference,
    ListOpenAIChatCompletionResponse,
+    ModelNotFoundError,
+    ModelType,
+    ModelTypeError,
    OpenAIAssistantMessageParam,
    OpenAIChatCompletion,
    OpenAIChatCompletionChunk,
+    OpenAIChatCompletionContentPartImageParam,
+    OpenAIChatCompletionContentPartTextParam,
    OpenAIChatCompletionRequestWithExtraBody,
    OpenAIChatCompletionToolCall,
    OpenAIChatCompletionToolCallFunction,
@ -35,19 +47,8 @@ from llama_stack.apis.inference import (
    OpenAIMessageParam,
    Order,
    RerankResponse,
+    RoutingTable,
 )
-from llama_stack.apis.inference.inference import (
-    OpenAIChatCompletionContentPartImageParam,
-    OpenAIChatCompletionContentPartTextParam,
-)
-from llama_stack.apis.models import ModelType
-from llama_stack.core.telemetry.telemetry import MetricEvent
-from llama_stack.core.telemetry.tracing import enqueue_event, get_current_span
-from llama_stack.log import get_logger
-from llama_stack.models.llama.llama3.chat_format import ChatFormat
-from llama_stack.models.llama.llama3.tokenizer import Tokenizer
-from llama_stack.providers.datatypes import HealthResponse, HealthStatus, RoutingTable
-from llama_stack.providers.utils.inference.inference_store import InferenceStore

 logger = get_logger(name=__name__, category="core::routers")

@ -190,7 +191,7 @@ class InferenceRouter(Inference):

        response = await provider.openai_completion(params)
        response.model = request_model_id
-        if self.telemetry_enabled:
+        if self.telemetry_enabled and response.usage is not None:
            metrics = self._construct_metrics(
                prompt_tokens=response.usage.prompt_tokens,
                completion_tokens=response.usage.completion_tokens,
@ -253,7 +254,7 @@ class InferenceRouter(Inference):
        if self.store:
            asyncio.create_task(self.store.store_chat_completion(response, params.messages))

-        if self.telemetry_enabled:
+        if self.telemetry_enabled and response.usage is not None:
            metrics = self._construct_metrics(
                prompt_tokens=response.usage.prompt_tokens,
                completion_tokens=response.usage.completion_tokens,
@ -416,7 +417,7 @@ class InferenceRouter(Inference):
                            prompt_tokens=chunk.usage.prompt_tokens,
                            completion_tokens=chunk.usage.completion_tokens,
                            total_tokens=chunk.usage.total_tokens,
-                            model_id=fully_qualified_model_id,
+                            fully_qualified_model_id=fully_qualified_model_id,
                            provider_id=provider_id,
                        )
                        for metric in metrics:
--- a/src/llama_stack/core/routers/safety.py
+++ b/src/llama_stack/core/routers/safety.py
@ -6,13 +6,9 @@

 from typing import Any

-from llama_stack.apis.inference import Message
-from llama_stack.apis.safety import RunShieldResponse, Safety
-from llama_stack.apis.safety.safety import ModerationObject
-from llama_stack.apis.shields import Shield
 from llama_stack.core.datatypes import SafetyConfig
 from llama_stack.log import get_logger
-from llama_stack.providers.datatypes import RoutingTable
+from llama_stack_api import ModerationObject, OpenAIMessageParam, RoutingTable, RunShieldResponse, Safety, Shield

 logger = get_logger(name=__name__, category="core::routers")

@ -52,7 +48,7 @@ class SafetyRouter(Safety):
    async def run_shield(
        self,
        shield_id: str,
-        messages: list[Message],
+        messages: list[OpenAIMessageParam],
        params: dict[str, Any] = None,
    ) -> RunShieldResponse:
        logger.debug(f"SafetyRouter.run_shield: {shield_id}")
--- a/src/llama_stack/core/routers/tool_runtime.py
+++ b/src/llama_stack/core/routers/tool_runtime.py
@ -6,19 +6,12 @@

 from typing import Any

-from llama_stack.apis.common.content_types import (
+from llama_stack.log import get_logger
+from llama_stack_api import (
    URL,
-    InterleavedContent,
-)
-from llama_stack.apis.tools import (
    ListToolDefsResponse,
-    RAGDocument,
-    RAGQueryConfig,
-    RAGQueryResult,
-    RAGToolRuntime,
    ToolRuntime,
 )
-from llama_stack.log import get_logger

 from ..routing_tables.toolgroups import ToolGroupsRoutingTable

@ -26,36 +19,6 @@ logger = get_logger(name=__name__, category="core::routers")


 class ToolRuntimeRouter(ToolRuntime):
-    class RagToolImpl(RAGToolRuntime):
-        def __init__(
-            self,
-            routing_table: ToolGroupsRoutingTable,
-        ) -> None:
-            logger.debug("Initializing ToolRuntimeRouter.RagToolImpl")
-            self.routing_table = routing_table
-
-        async def query(
-            self,
-            content: InterleavedContent,
-            vector_store_ids: list[str],
-            query_config: RAGQueryConfig | None = None,
-        ) -> RAGQueryResult:
-            logger.debug(f"ToolRuntimeRouter.RagToolImpl.query: {vector_store_ids}")
-            provider = await self.routing_table.get_provider_impl("knowledge_search")
-            return await provider.query(content, vector_store_ids, query_config)
-
-        async def insert(
-            self,
-            documents: list[RAGDocument],
-            vector_store_id: str,
-            chunk_size_in_tokens: int = 512,
-        ) -> None:
-            logger.debug(
-                f"ToolRuntimeRouter.RagToolImpl.insert: {vector_store_id}, {len(documents)} documents, chunk_size={chunk_size_in_tokens}"
-            )
-            provider = await self.routing_table.get_provider_impl("insert_into_memory")
-            return await provider.insert(documents, vector_store_id, chunk_size_in_tokens)
-
    def __init__(
        self,
        routing_table: ToolGroupsRoutingTable,
@ -63,11 +26,6 @@ class ToolRuntimeRouter(ToolRuntime):
        logger.debug("Initializing ToolRuntimeRouter")
        self.routing_table = routing_table

-        # HACK ALERT this should be in sync with "get_all_api_endpoints()"
-        self.rag_tool = self.RagToolImpl(routing_table)
-        for method in ("query", "insert"):
-            setattr(self, f"rag_tool.{method}", getattr(self.rag_tool, method))
-
    async def initialize(self) -> None:
        logger.debug("ToolRuntimeRouter.initialize")
        pass
@ -76,16 +34,16 @@ class ToolRuntimeRouter(ToolRuntime):
        logger.debug("ToolRuntimeRouter.shutdown")
        pass

-    async def invoke_tool(self, tool_name: str, kwargs: dict[str, Any]) -> Any:
+    async def invoke_tool(self, tool_name: str, kwargs: dict[str, Any], authorization: str | None = None) -> Any:
        logger.debug(f"ToolRuntimeRouter.invoke_tool: {tool_name}")
        provider = await self.routing_table.get_provider_impl(tool_name)
        return await provider.invoke_tool(
            tool_name=tool_name,
            kwargs=kwargs,
+            authorization=authorization,
        )

    async def list_runtime_tools(
-        self, tool_group_id: str | None = None, mcp_endpoint: URL | None = None
+        self, tool_group_id: str | None = None, mcp_endpoint: URL | None = None, authorization: str | None = None
    ) -> ListToolDefsResponse:
-        logger.debug(f"ToolRuntimeRouter.list_runtime_tools: {tool_group_id}")
-        return await self.routing_table.list_tools(tool_group_id)
+        return await self.routing_table.list_tools(tool_group_id, authorization=authorization)
--- a/src/llama_stack/core/routers/vector_io.py
+++ b/src/llama_stack/core/routers/vector_io.py
@ -10,19 +10,28 @@ from typing import Annotated, Any

 from fastapi import Body

-from llama_stack.apis.common.content_types import InterleavedContent
-from llama_stack.apis.models import ModelType
-from llama_stack.apis.vector_io import (
+from llama_stack.core.datatypes import VectorStoresConfig
+from llama_stack.log import get_logger
+from llama_stack_api import (
    Chunk,
+    HealthResponse,
+    HealthStatus,
+    InterleavedContent,
+    ModelNotFoundError,
+    ModelType,
+    ModelTypeError,
    OpenAICreateVectorStoreFileBatchRequestWithExtraBody,
    OpenAICreateVectorStoreRequestWithExtraBody,
    QueryChunksResponse,
+    RoutingTable,
    SearchRankingOptions,
    VectorIO,
    VectorStoreChunkingStrategy,
+    VectorStoreChunkingStrategyStatic,
+    VectorStoreChunkingStrategyStaticConfig,
    VectorStoreDeleteResponse,
    VectorStoreFileBatchObject,
-    VectorStoreFileContentsResponse,
+    VectorStoreFileContentResponse,
    VectorStoreFileDeleteResponse,
    VectorStoreFileObject,
    VectorStoreFilesListInBatchResponse,
@ -31,9 +40,6 @@ from llama_stack.apis.vector_io import (
    VectorStoreObject,
    VectorStoreSearchResponsePage,
 )
-from llama_stack.core.datatypes import VectorStoresConfig
-from llama_stack.log import get_logger
-from llama_stack.providers.datatypes import HealthResponse, HealthStatus, RoutingTable

 logger = get_logger(name=__name__, category="core::routers")

@ -120,6 +126,14 @@ class VectorIORouter(VectorIO):
        if embedding_model is not None and embedding_dimension is None:
            embedding_dimension = await self._get_embedding_model_dimension(embedding_model)

+        # Validate that embedding model exists and is of the correct type
+        if embedding_model is not None:
+            model = await self.routing_table.get_object_by_identifier("model", embedding_model)
+            if model is None:
+                raise ModelNotFoundError(embedding_model)
+            if model.model_type != ModelType.embedding:
+                raise ModelTypeError(embedding_model, model.model_type, ModelType.embedding)
+
        # Auto-select provider if not specified
        if provider_id is None:
            num_providers = len(self.routing_table.impls_by_provider_id)
@ -167,6 +181,13 @@ class VectorIORouter(VectorIO):
        if embedding_dimension is not None:
            params.model_extra["embedding_dimension"] = embedding_dimension

+        # Set chunking strategy explicitly if not provided
+        if params.chunking_strategy is None or params.chunking_strategy.type == "auto":
+            # actualize the chunking strategy to static
+            params.chunking_strategy = VectorStoreChunkingStrategyStatic(
+                static=VectorStoreChunkingStrategyStaticConfig()
+            )
+
        return await provider.openai_create_vector_store(params)

    async def openai_list_vector_stores(
@ -238,6 +259,13 @@ class VectorIORouter(VectorIO):
        metadata: dict[str, Any] | None = None,
    ) -> VectorStoreObject:
        logger.debug(f"VectorIORouter.openai_update_vector_store: {vector_store_id}")
+
+        # Check if provider_id is being changed (not supported)
+        if metadata and "provider_id" in metadata:
+            current_store = await self.routing_table.get_object_by_identifier("vector_store", vector_store_id)
+            if current_store and current_store.provider_id != metadata["provider_id"]:
+                raise ValueError("provider_id cannot be changed after vector store creation")
+
        provider = await self.routing_table.get_provider_impl(vector_store_id)
        return await provider.openai_update_vector_store(
            vector_store_id=vector_store_id,
@ -283,6 +311,8 @@ class VectorIORouter(VectorIO):
        chunking_strategy: VectorStoreChunkingStrategy | None = None,
    ) -> VectorStoreFileObject:
        logger.debug(f"VectorIORouter.openai_attach_file_to_vector_store: {vector_store_id}, {file_id}")
+        if chunking_strategy is None or chunking_strategy.type == "auto":
+            chunking_strategy = VectorStoreChunkingStrategyStatic(static=VectorStoreChunkingStrategyStaticConfig())
        provider = await self.routing_table.get_provider_impl(vector_store_id)
        return await provider.openai_attach_file_to_vector_store(
            vector_store_id=vector_store_id,
@ -327,12 +357,19 @@ class VectorIORouter(VectorIO):
        self,
        vector_store_id: str,
        file_id: str,
-    ) -> VectorStoreFileContentsResponse:
-        logger.debug(f"VectorIORouter.openai_retrieve_vector_store_file_contents: {vector_store_id}, {file_id}")
-        provider = await self.routing_table.get_provider_impl(vector_store_id)
-        return await provider.openai_retrieve_vector_store_file_contents(
+        include_embeddings: bool | None = False,
+        include_metadata: bool | None = False,
+    ) -> VectorStoreFileContentResponse:
+        logger.debug(
+            f"VectorIORouter.openai_retrieve_vector_store_file_contents: {vector_store_id}, {file_id}, "
+            f"include_embeddings={include_embeddings}, include_metadata={include_metadata}"
+        )
+
+        return await self.routing_table.openai_retrieve_vector_store_file_contents(
            vector_store_id=vector_store_id,
            file_id=file_id,
+            include_embeddings=include_embeddings,
+            include_metadata=include_metadata,
        )

    async def openai_update_vector_store_file(
--- a/src/llama_stack/core/routing_tables/benchmarks.py
+++ b/src/llama_stack/core/routing_tables/benchmarks.py
@ -6,11 +6,11 @@

 from typing import Any

-from llama_stack.apis.benchmarks import Benchmark, Benchmarks, ListBenchmarksResponse
 from llama_stack.core.datatypes import (
    BenchmarkWithOwner,
 )
 from llama_stack.log import get_logger
+from llama_stack_api import Benchmark, Benchmarks, ListBenchmarksResponse

 from .common import CommonRoutingTableImpl

--- a/src/llama_stack/core/routing_tables/common.py
+++ b/src/llama_stack/core/routing_tables/common.py
@ -6,9 +6,6 @@

 from typing import Any

-from llama_stack.apis.common.errors import ModelNotFoundError
-from llama_stack.apis.models import Model
-from llama_stack.apis.resource import ResourceType
 from llama_stack.core.access_control.access_control import AccessDeniedError, is_action_allowed
 from llama_stack.core.access_control.datatypes import Action
 from llama_stack.core.datatypes import (
@ -21,7 +18,7 @@ from llama_stack.core.datatypes import (
 from llama_stack.core.request_headers import get_authenticated_user
 from llama_stack.core.store import DistributionRegistry
 from llama_stack.log import get_logger
-from llama_stack.providers.datatypes import Api, RoutingTable
+from llama_stack_api import Api, Model, ModelNotFoundError, ResourceType, RoutingTable

 logger = get_logger(name=__name__, category="core::routing_tables")

--- a/src/llama_stack/core/routing_tables/datasets.py
+++ b/src/llama_stack/core/routing_tables/datasets.py
@ -7,22 +7,22 @@
 import uuid
 from typing import Any

-from llama_stack.apis.common.errors import DatasetNotFoundError
-from llama_stack.apis.datasets import (
+from llama_stack.core.datatypes import (
+    DatasetWithOwner,
+)
+from llama_stack.log import get_logger
+from llama_stack_api import (
    Dataset,
+    DatasetNotFoundError,
    DatasetPurpose,
    Datasets,
    DatasetType,
    DataSource,
    ListDatasetsResponse,
+    ResourceType,
    RowsDataSource,
    URIDataSource,
 )
-from llama_stack.apis.resource import ResourceType
-from llama_stack.core.datatypes import (
-    DatasetWithOwner,
-)
-from llama_stack.log import get_logger

 from .common import CommonRoutingTableImpl

--- a/src/llama_stack/core/routing_tables/models.py
+++ b/src/llama_stack/core/routing_tables/models.py
@ -7,8 +7,6 @@
 import time
 from typing import Any

-from llama_stack.apis.common.errors import ModelNotFoundError
-from llama_stack.apis.models import ListModelsResponse, Model, Models, ModelType, OpenAIListModelsResponse, OpenAIModel
 from llama_stack.core.datatypes import (
    ModelWithOwner,
    RegistryEntrySource,
@ -16,6 +14,15 @@ from llama_stack.core.datatypes import (
 from llama_stack.core.request_headers import PROVIDER_DATA_VAR, NeedsRequestProviderData
 from llama_stack.core.utils.dynamic import instantiate_class_type
 from llama_stack.log import get_logger
+from llama_stack_api import (
+    ListModelsResponse,
+    Model,
+    ModelNotFoundError,
+    Models,
+    ModelType,
+    OpenAIListModelsResponse,
+    OpenAIModel,
+)

 from .common import CommonRoutingTableImpl, lookup_model

--- a/src/llama_stack/core/routing_tables/scoring_functions.py
+++ b/src/llama_stack/core/routing_tables/scoring_functions.py
@ -4,18 +4,18 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.

-from llama_stack.apis.common.type_system import ParamType
-from llama_stack.apis.resource import ResourceType
-from llama_stack.apis.scoring_functions import (
-    ListScoringFunctionsResponse,
-    ScoringFn,
-    ScoringFnParams,
-    ScoringFunctions,
-)
 from llama_stack.core.datatypes import (
    ScoringFnWithOwner,
 )
 from llama_stack.log import get_logger
+from llama_stack_api import (
+    ListScoringFunctionsResponse,
+    ParamType,
+    ResourceType,
+    ScoringFn,
+    ScoringFnParams,
+    ScoringFunctions,
+)

 from .common import CommonRoutingTableImpl

--- a/src/llama_stack/core/routing_tables/shields.py
+++ b/src/llama_stack/core/routing_tables/shields.py
@ -6,12 +6,11 @@

 from typing import Any

-from llama_stack.apis.resource import ResourceType
-from llama_stack.apis.shields import ListShieldsResponse, Shield, Shields
 from llama_stack.core.datatypes import (
    ShieldWithOwner,
 )
 from llama_stack.log import get_logger
+from llama_stack_api import ListShieldsResponse, ResourceType, Shield, Shields

 from .common import CommonRoutingTableImpl

--- a/src/llama_stack/core/routing_tables/toolgroups.py
+++ b/src/llama_stack/core/routing_tables/toolgroups.py
@ -6,11 +6,17 @@

 from typing import Any

-from llama_stack.apis.common.content_types import URL
-from llama_stack.apis.common.errors import ToolGroupNotFoundError
-from llama_stack.apis.tools import ListToolDefsResponse, ListToolGroupsResponse, ToolDef, ToolGroup, ToolGroups
 from llama_stack.core.datatypes import AuthenticationRequiredError, ToolGroupWithOwner
 from llama_stack.log import get_logger
+from llama_stack_api import (
+    URL,
+    ListToolDefsResponse,
+    ListToolGroupsResponse,
+    ToolDef,
+    ToolGroup,
+    ToolGroupNotFoundError,
+    ToolGroups,
+)

 from .common import CommonRoutingTableImpl

@ -43,7 +49,9 @@ class ToolGroupsRoutingTable(CommonRoutingTableImpl, ToolGroups):
            routing_key = self.tool_to_toolgroup[routing_key]
        return await super().get_provider_impl(routing_key, provider_id)

-    async def list_tools(self, toolgroup_id: str | None = None) -> ListToolDefsResponse:
+    async def list_tools(
+        self, toolgroup_id: str | None = None, authorization: str | None = None
+    ) -> ListToolDefsResponse:
        if toolgroup_id:
            if group_id := parse_toolgroup_from_toolgroup_name_pair(toolgroup_id):
                toolgroup_id = group_id
@ -55,7 +63,7 @@ class ToolGroupsRoutingTable(CommonRoutingTableImpl, ToolGroups):
        for toolgroup in toolgroups:
            if toolgroup.identifier not in self.toolgroups_to_tools:
                try:
-                    await self._index_tools(toolgroup)
+                    await self._index_tools(toolgroup, authorization=authorization)
                except AuthenticationRequiredError:
                    # Send authentication errors back to the client so it knows
                    # that it needs to supply credentials for remote MCP servers.
@ -70,9 +78,11 @@ class ToolGroupsRoutingTable(CommonRoutingTableImpl, ToolGroups):

        return ListToolDefsResponse(data=all_tools)

-    async def _index_tools(self, toolgroup: ToolGroup):
+    async def _index_tools(self, toolgroup: ToolGroup, authorization: str | None = None):
        provider_impl = await super().get_provider_impl(toolgroup.identifier, toolgroup.provider_id)
-        tooldefs_response = await provider_impl.list_runtime_tools(toolgroup.identifier, toolgroup.mcp_endpoint)
+        tooldefs_response = await provider_impl.list_runtime_tools(
+            toolgroup.identifier, toolgroup.mcp_endpoint, authorization=authorization
+        )

        tooldefs = tooldefs_response.data
        for t in tooldefs:
--- a/src/llama_stack/core/routing_tables/vector_stores.py
+++ b/src/llama_stack/core/routing_tables/vector_stores.py
@ -6,26 +6,27 @@

 from typing import Any

-from llama_stack.apis.common.errors import ModelNotFoundError, ModelTypeError
-from llama_stack.apis.models import ModelType
-from llama_stack.apis.resource import ResourceType
+from llama_stack.core.datatypes import (
+    VectorStoreWithOwner,
+)
+from llama_stack.log import get_logger

 # Removed VectorStores import to avoid exposing public API
-from llama_stack.apis.vector_io.vector_io import (
+from llama_stack_api import (
+    ModelNotFoundError,
+    ModelType,
+    ModelTypeError,
+    ResourceType,
    SearchRankingOptions,
    VectorStoreChunkingStrategy,
    VectorStoreDeleteResponse,
-    VectorStoreFileContentsResponse,
+    VectorStoreFileContentResponse,
    VectorStoreFileDeleteResponse,
    VectorStoreFileObject,
    VectorStoreFileStatus,
    VectorStoreObject,
    VectorStoreSearchResponsePage,
 )
-from llama_stack.core.datatypes import (
-    VectorStoreWithOwner,
-)
-from llama_stack.log import get_logger

 from .common import CommonRoutingTableImpl, lookup_model

@ -195,12 +196,17 @@ class VectorStoresRoutingTable(CommonRoutingTableImpl):
        self,
        vector_store_id: str,
        file_id: str,
-    ) -> VectorStoreFileContentsResponse:
+        include_embeddings: bool | None = False,
+        include_metadata: bool | None = False,
+    ) -> VectorStoreFileContentResponse:
        await self.assert_action_allowed("read", "vector_store", vector_store_id)
+
        provider = await self.get_provider_impl(vector_store_id)
        return await provider.openai_retrieve_vector_store_file_contents(
            vector_store_id=vector_store_id,
            file_id=file_id,
+            include_embeddings=include_embeddings,
+            include_metadata=include_metadata,
        )

    async def openai_update_vector_store_file(
--- a/src/llama_stack/core/server/auth_providers.py
+++ b/src/llama_stack/core/server/auth_providers.py
@ -13,7 +13,6 @@ import httpx
 import jwt
 from pydantic import BaseModel, Field

-from llama_stack.apis.common.errors import TokenValidationError
 from llama_stack.core.datatypes import (
    AuthenticationConfig,
    CustomAuthConfig,
@ -23,6 +22,7 @@ from llama_stack.core.datatypes import (
    User,
 )
 from llama_stack.log import get_logger
+from llama_stack_api import TokenValidationError

 logger = get_logger(name=__name__, category="core::auth")

--- a/src/llama_stack/core/server/quota.py
+++ b/src/llama_stack/core/server/quota.py
@ -11,9 +11,9 @@ from datetime import UTC, datetime, timedelta
 from starlette.types import ASGIApp, Receive, Scope, Send

 from llama_stack.core.storage.datatypes import KVStoreReference, StorageBackendType
+from llama_stack.core.storage.kvstore.kvstore import _KVSTORE_BACKENDS, kvstore_impl
 from llama_stack.log import get_logger
-from llama_stack.providers.utils.kvstore.api import KVStore
-from llama_stack.providers.utils.kvstore.kvstore import _KVSTORE_BACKENDS, kvstore_impl
+from llama_stack_api.internal.kvstore import KVStore

 logger = get_logger(name=__name__, category="core::server")

--- a/src/llama_stack/core/server/routes.py
+++ b/src/llama_stack/core/server/routes.py
@ -12,10 +12,8 @@ from typing import Any
 from aiohttp import hdrs
 from starlette.routing import Route

-from llama_stack.apis.datatypes import Api, ExternalApiSpec
-from llama_stack.apis.tools import RAGToolRuntime, SpecialToolGroup
 from llama_stack.core.resolver import api_protocol_map
-from llama_stack.schema_utils import WebMethod
+from llama_stack_api import Api, ExternalApiSpec, WebMethod

 EndpointFunc = Callable[..., Any]
 PathParams = dict[str, str]
@@ -25,33 +23,16 @@ RouteImpls = dict[str, PathImpl]
 RouteMatch = tuple[EndpointFunc, PathParams, str, WebMethod]


-def toolgroup_protocol_map():
-    return {
-        SpecialToolGroup.rag_tool: RAGToolRuntime,
-    }
-
-
 def get_all_api_routes(
    external_apis: dict[Api, ExternalApiSpec] | None = None,
 ) -> dict[Api, list[tuple[Route, WebMethod]]]:
    apis = {}

    protocols = api_protocol_map(external_apis)
-    toolgroup_protocols = toolgroup_protocol_map()
    for api, protocol in protocols.items():
        routes = []
        protocol_methods = inspect.getmembers(protocol, predicate=inspect.isfunction)

-        # HACK ALERT
-        if api == Api.tool_runtime:
-            for tool_group in SpecialToolGroup:
-                sub_protocol = toolgroup_protocols[tool_group]
-                sub_protocol_methods = inspect.getmembers(sub_protocol, predicate=inspect.isfunction)
-                for name, method in sub_protocol_methods:
-                    if not hasattr(method, "__webmethod__"):
-                        continue
-                    protocol_methods.append((f"{tool_group.value}.{name}", method))
-
        for name, method in protocol_methods:
            # Get all webmethods for this method (supports multiple decorators)
            webmethods = getattr(method, "__webmethods__", [])
--- a/src/llama_stack/core/server/server.py
+++ b/src/llama_stack/core/server/server.py
@@ -31,8 +31,6 @@ from fastapi.responses import JSONResponse, StreamingResponse
 from openai import BadRequestError
 from pydantic import BaseModel, ValidationError

-from llama_stack.apis.common.errors import ConflictError, ResourceNotFoundError
-from llama_stack.apis.common.responses import PaginatedResponse
 from llama_stack.core.access_control.access_control import AccessDeniedError
 from llama_stack.core.datatypes import (
    AuthenticationRequiredError,
@@ -58,7 +56,7 @@ from llama_stack.core.utils.config import redact_sensitive_fields
 from llama_stack.core.utils.config_resolution import Mode, resolve_config_or_distro
 from llama_stack.core.utils.context import preserve_contexts_async_generator
 from llama_stack.log import LoggingConfig, get_logger, setup_logging
-from llama_stack.providers.datatypes import Api
+from llama_stack_api import Api, ConflictError, PaginatedResponse, ResourceNotFoundError

 from .auth import AuthenticationMiddleware
 from .quota import QuotaMiddleware
@@ -526,8 +524,8 @@ def extract_path_params(route: str) -> list[str]:

 def remove_disabled_providers(obj):
    if isinstance(obj, dict):
-        keys = ["provider_id", "shield_id", "provider_model_id", "model_id"]
-        if any(k in obj and obj[k] in ("__disabled__", "", None) for k in keys):
+        # Filter out items where provider_id is explicitly disabled or empty
+        if "provider_id" in obj and obj["provider_id"] in ("__disabled__", "", None):
            return None
        return {k: v for k, v in ((k, remove_disabled_providers(v)) for k, v in obj.items()) if v is not None}
    elif isinstance(obj, list):
--- a/src/llama_stack/core/stack.py
+++ b/src/llama_stack/core/stack.py
@@ -13,26 +13,6 @@ from typing import Any

 import yaml

-from llama_stack.apis.agents import Agents
-from llama_stack.apis.batches import Batches
-from llama_stack.apis.benchmarks import Benchmarks
-from llama_stack.apis.conversations import Conversations
-from llama_stack.apis.datasetio import DatasetIO
-from llama_stack.apis.datasets import Datasets
-from llama_stack.apis.eval import Eval
-from llama_stack.apis.files import Files
-from llama_stack.apis.inference import Inference
-from llama_stack.apis.inspect import Inspect
-from llama_stack.apis.models import Models
-from llama_stack.apis.post_training import PostTraining
-from llama_stack.apis.prompts import Prompts
-from llama_stack.apis.providers import Providers
-from llama_stack.apis.safety import Safety
-from llama_stack.apis.scoring import Scoring
-from llama_stack.apis.scoring_functions import ScoringFunctions
-from llama_stack.apis.shields import Shields
-from llama_stack.apis.tools import RAGToolRuntime, ToolGroups, ToolRuntime
-from llama_stack.apis.vector_io import VectorIO
 from llama_stack.core.conversations.conversations import ConversationServiceConfig, ConversationServiceImpl
 from llama_stack.core.datatypes import Provider, SafetyConfig, StackRunConfig, VectorStoresConfig
 from llama_stack.core.distribution import get_provider_registry
@@ -54,7 +34,30 @@ from llama_stack.core.storage.datatypes import (
 from llama_stack.core.store.registry import create_dist_registry
 from llama_stack.core.utils.dynamic import instantiate_class_type
 from llama_stack.log import get_logger
-from llama_stack.providers.datatypes import Api
+from llama_stack_api import (
+    Agents,
+    Api,
+    Batches,
+    Benchmarks,
+    Conversations,
+    DatasetIO,
+    Datasets,
+    Eval,
+    Files,
+    Inference,
+    Inspect,
+    Models,
+    PostTraining,
+    Prompts,
+    Providers,
+    Safety,
+    Scoring,
+    ScoringFunctions,
+    Shields,
+    ToolGroups,
+    ToolRuntime,
+    VectorIO,
+)

 logger = get_logger(name=__name__, category="core")

@@ -78,7 +81,6 @@ class LlamaStack(
    Inspect,
    ToolGroups,
    ToolRuntime,
-    RAGToolRuntime,
    Files,
    Prompts,
    Conversations,
@@ -383,8 +385,8 @@ def _initialize_storage(run_config: StackRunConfig):
        else:
            raise ValueError(f"Unknown storage backend type: {type}")

-    from llama_stack.providers.utils.kvstore.kvstore import register_kvstore_backends
-    from llama_stack.providers.utils.sqlstore.sqlstore import register_sqlstore_backends
+    from llama_stack.core.storage.kvstore.kvstore import register_kvstore_backends
+    from llama_stack.core.storage.sqlstore.sqlstore import register_sqlstore_backends

    register_kvstore_backends(kv_backends)
    register_sqlstore_backends(sql_backends)
--- a/src/llama_stack/core/storage/datatypes.py
+++ b/src/llama_stack/core/storage/datatypes.py
@@ -12,6 +12,8 @@ from typing import Annotated, Literal

 from pydantic import BaseModel, Field, field_validator

+from llama_stack.core.utils.config_dirs import DISTRIBS_BASE_DIR
+

 class StorageBackendType(StrEnum):
    KV_REDIS = "kv_redis"
@@ -256,15 +258,24 @@ class ResponsesStoreReference(InferenceStoreReference):

 class ServerStoresConfig(BaseModel):
    metadata: KVStoreReference | None = Field(
-        default=None,
+        default=KVStoreReference(
+            backend="kv_default",
+            namespace="registry",
+        ),
        description="Metadata store configuration (uses KV backend)",
    )
    inference: InferenceStoreReference | None = Field(
-        default=None,
+        default=InferenceStoreReference(
+            backend="sql_default",
+            table_name="inference_store",
+        ),
        description="Inference store configuration (uses SQL backend)",
    )
    conversations: SqlStoreReference | None = Field(
-        default=None,
+        default=SqlStoreReference(
+            backend="sql_default",
+            table_name="openai_conversations",
+        ),
        description="Conversations store configuration (uses SQL backend)",
    )
    responses: ResponsesStoreReference | None = Field(
@@ -272,13 +283,21 @@ class ServerStoresConfig(BaseModel):
        description="Responses store configuration (uses SQL backend)",
    )
    prompts: KVStoreReference | None = Field(
-        default=None,
+        default=KVStoreReference(backend="kv_default", namespace="prompts"),
        description="Prompts store configuration (uses KV backend)",
    )


 class StorageConfig(BaseModel):
    backends: dict[str, StorageBackendConfig] = Field(
+        default={
+            "kv_default": SqliteKVStoreConfig(
+                db_path=f"${{env.SQLITE_STORE_DIR:={DISTRIBS_BASE_DIR}}}/kvstore.db",
+            ),
+            "sql_default": SqliteSqlStoreConfig(
+                db_path=f"${{env.SQLITE_STORE_DIR:={DISTRIBS_BASE_DIR}}}/sql_store.db",
+            ),
+        },
        description="Named backend configurations (e.g., 'default', 'cache')",
    )
    stores: ServerStoresConfig = Field(
--- a/src/llama_stack/providers/utils/kvstore/__init__.py
+++ b/src/llama_stack/providers/utils/kvstore/__init__.py
@@ -4,4 +4,6 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.

+from llama_stack_api.internal.kvstore import KVStore as KVStore
+
 from .kvstore import *  # noqa: F401, F403
--- a/src/llama_stack/providers/utils/kvstore/config.py
+++ b/src/llama_stack/providers/utils/kvstore/config.py
--- a/src/llama_stack/providers/utils/kvstore/kvstore.py
+++ b/src/llama_stack/providers/utils/kvstore/kvstore.py
@@ -11,10 +11,21 @@

 from __future__ import annotations

-from llama_stack.core.storage.datatypes import KVStoreReference, StorageBackendConfig, StorageBackendType
+import asyncio
+from collections import defaultdict
+from datetime import datetime
+from typing import cast

-from .api import KVStore
-from .config import KVStoreConfig
+from llama_stack.core.storage.datatypes import KVStoreReference, StorageBackendConfig
+from llama_stack_api.internal.kvstore import KVStore
+
+from .config import (
+    KVStoreConfig,
+    MongoDBKVStoreConfig,
+    PostgresKVStoreConfig,
+    RedisKVStoreConfig,
+    SqliteKVStoreConfig,
+)


 def kvstore_dependencies():
@@ -30,7 +41,7 @@ def kvstore_dependencies():

 class InmemoryKVStoreImpl(KVStore):
    def __init__(self):
-        self._store = {}
+        self._store: dict[str, str] = {}

    async def initialize(self) -> None:
        pass
@@ -38,7 +49,7 @@ class InmemoryKVStoreImpl(KVStore):
    async def get(self, key: str) -> str | None:
        return self._store.get(key)

-    async def set(self, key: str, value: str) -> None:
+    async def set(self, key: str, value: str, expiration: datetime | None = None) -> None:
        self._store[key] = value

    async def values_in_range(self, start_key: str, end_key: str) -> list[str]:
@@ -53,45 +64,65 @@ class InmemoryKVStoreImpl(KVStore):


 _KVSTORE_BACKENDS: dict[str, KVStoreConfig] = {}
+_KVSTORE_INSTANCES: dict[tuple[str, str], KVStore] = {}
+_KVSTORE_LOCKS: defaultdict[tuple[str, str], asyncio.Lock] = defaultdict(asyncio.Lock)


 def register_kvstore_backends(backends: dict[str, StorageBackendConfig]) -> None:
    """Register the set of available KV store backends for reference resolution."""
    global _KVSTORE_BACKENDS
+    global _KVSTORE_INSTANCES
+    global _KVSTORE_LOCKS

    _KVSTORE_BACKENDS.clear()
+    _KVSTORE_INSTANCES.clear()
+    _KVSTORE_LOCKS.clear()
    for name, cfg in backends.items():
-        _KVSTORE_BACKENDS[name] = cfg
+        typed_cfg = cast(KVStoreConfig, cfg)
+        _KVSTORE_BACKENDS[name] = typed_cfg


 async def kvstore_impl(reference: KVStoreReference) -> KVStore:
    backend_name = reference.backend
+    cache_key = (backend_name, reference.namespace)
+
+    existing = _KVSTORE_INSTANCES.get(cache_key)
+    if existing:
+        return existing

    backend_config = _KVSTORE_BACKENDS.get(backend_name)
    if backend_config is None:
        raise ValueError(f"Unknown KVStore backend '{backend_name}'. Registered backends: {sorted(_KVSTORE_BACKENDS)}")

-    config = backend_config.model_copy()
-    config.namespace = reference.namespace
+    lock = _KVSTORE_LOCKS[cache_key]
+    async with lock:
+        existing = _KVSTORE_INSTANCES.get(cache_key)
+        if existing:
+            return existing

-    if config.type == StorageBackendType.KV_REDIS.value:
-        from .redis import RedisKVStoreImpl
+        config = backend_config.model_copy()
+        config.namespace = reference.namespace

-        impl = RedisKVStoreImpl(config)
-    elif config.type == StorageBackendType.KV_SQLITE.value:
-        from .sqlite import SqliteKVStoreImpl
+        impl: KVStore
+        if isinstance(config, RedisKVStoreConfig):
+            from .redis import RedisKVStoreImpl

-        impl = SqliteKVStoreImpl(config)
-    elif config.type == StorageBackendType.KV_POSTGRES.value:
-        from .postgres import PostgresKVStoreImpl
+            impl = RedisKVStoreImpl(config)
+        elif isinstance(config, SqliteKVStoreConfig):
+            from .sqlite import SqliteKVStoreImpl

-        impl = PostgresKVStoreImpl(config)
-    elif config.type == StorageBackendType.KV_MONGODB.value:
-        from .mongodb import MongoDBKVStoreImpl
+            impl = SqliteKVStoreImpl(config)
+        elif isinstance(config, PostgresKVStoreConfig):
+            from .postgres import PostgresKVStoreImpl

-        impl = MongoDBKVStoreImpl(config)
-    else:
-        raise ValueError(f"Unknown kvstore type {config.type}")
+            impl = PostgresKVStoreImpl(config)
+        elif isinstance(config, MongoDBKVStoreConfig):
+            from .mongodb import MongoDBKVStoreImpl

-    await impl.initialize()
-    return impl
+            impl = MongoDBKVStoreImpl(config)
+        else:
+            raise ValueError(f"Unknown kvstore type {config.type}")
+
+        await impl.initialize()
+        _KVSTORE_INSTANCES[cache_key] = impl
+        return impl
--- a/src/llama_stack/providers/utils/kvstore/mongodb/__init__.py
+++ b/src/llama_stack/providers/utils/kvstore/mongodb/__init__.py
--- a/src/llama_stack/providers/utils/kvstore/mongodb/mongodb.py
+++ b/src/llama_stack/providers/utils/kvstore/mongodb/mongodb.py
@@ -9,8 +9,8 @@ from datetime import datetime
 from pymongo import AsyncMongoClient
 from pymongo.asynchronous.collection import AsyncCollection

+from llama_stack.core.storage.kvstore import KVStore
 from llama_stack.log import get_logger
-from llama_stack.providers.utils.kvstore import KVStore

 from ..config import MongoDBKVStoreConfig

--- a/src/llama_stack/providers/utils/kvstore/postgres/__init__.py
+++ b/src/llama_stack/providers/utils/kvstore/postgres/__init__.py
--- a/Show more
+++ b/Show more