feat: split API and provider specs into separate llama-stack-api pkg (#3895)

# What does this PR do?

Extract API definitions and provider specifications into a standalone llama-stack-api package that can be published to PyPI independently of the main llama-stack server.

See: https://github.com/llamastack/llama-stack/pull/2978 and https://github.com/llamastack/llama-stack/pull/2978#issuecomment-3145115942

Motivation

External providers currently import from llama-stack, which overrides the installed version and causes dependency conflicts. This separation allows external providers to:

- Install only the type definitions they need without server dependencies
- Avoid version conflicts with the installed llama-stack package
- Be versioned and released independently

This enables us to re-enable external provider module tests that were previously blocked by these import conflicts.

Changes

- Created llama-stack-api package with minimal dependencies (pydantic, jsonschema)
- Moved APIs, providers datatypes, strong_typing, and schema_utils
- Updated all imports from llama_stack.* to llama_stack_api.*
- Configured local editable install for development workflow
- Updated linting and type-checking configuration for both packages

Next Steps

- Publish llama-stack-api to PyPI
- Update external provider dependencies
- Re-enable external provider module tests

Precursor PRs to this one:

- #4093
- #3954
- #4064

These PRs moved key pieces _out_ of the API package, limiting the scope of change here.

Relates to #3237

## Test Plan

Package builds successfully and can be imported independently. All pre-commit hooks pass with expected exclusions maintained.

---------

Signed-off-by: Charlie Doern <cdoern@redhat.com>
2025-12-03 09:53:45 +00:00 · 2025-11-13 14:51:17 -05:00 · 2025-11-13 14:51:17 -05:00 · 840ad75fe9
commit 840ad75fe9
parent ceb716b9a0
358 changed files with 2337 additions and 1424 deletions
--- a/src/llama_stack/apis/__init__.py
+++ b/src/llama_stack/apis/__init__.py
@@ -1,5 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
--- a/src/llama_stack/apis/agents/__init__.py
+++ b/src/llama_stack/apis/agents/__init__.py
@@ -1,7 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from .agents import *
--- a/src/llama_stack/apis/agents/agents.py
+++ b/src/llama_stack/apis/agents/agents.py
@@ -1,153 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from collections.abc import AsyncIterator
-from typing import Annotated, Protocol, runtime_checkable
-
-from pydantic import BaseModel
-
-from llama_stack.apis.common.responses import Order
-from llama_stack.apis.version import LLAMA_STACK_API_V1
-from llama_stack.schema_utils import ExtraBodyField, json_schema_type, webmethod
-
-from .openai_responses import (
-    ListOpenAIResponseInputItem,
-    ListOpenAIResponseObject,
-    OpenAIDeleteResponseObject,
-    OpenAIResponseInput,
-    OpenAIResponseInputTool,
-    OpenAIResponseObject,
-    OpenAIResponseObjectStream,
-    OpenAIResponsePrompt,
-    OpenAIResponseText,
-)
-
-
-@json_schema_type
-class ResponseGuardrailSpec(BaseModel):
-    """Specification for a guardrail to apply during response generation.
-
-    :param type: The type/identifier of the guardrail.
-    """
-
-    type: str
-    # TODO: more fields to be added for guardrail configuration
-
-
-ResponseGuardrail = str | ResponseGuardrailSpec
-
-
-@runtime_checkable
-class Agents(Protocol):
-    """Agents
-
-    APIs for creating and interacting with agentic systems."""
-
-    # We situate the OpenAI Responses API in the Agents API just like we did things
-    # for Inference. The Responses API, in its intent, serves the same purpose as
-    # the Agents API above -- it is essentially a lightweight "agentic loop" with
-    # integrated tool calling.
-    #
-    # Both of these APIs are inherently stateful.
-
-    @webmethod(route="/responses/{response_id}", method="GET", level=LLAMA_STACK_API_V1)
-    async def get_openai_response(
-        self,
-        response_id: str,
-    ) -> OpenAIResponseObject:
-        """Get a model response.
-
-        :param response_id: The ID of the OpenAI response to retrieve.
-        :returns: An OpenAIResponseObject.
-        """
-        ...
-
-    @webmethod(route="/responses", method="POST", level=LLAMA_STACK_API_V1)
-    async def create_openai_response(
-        self,
-        input: str | list[OpenAIResponseInput],
-        model: str,
-        prompt: OpenAIResponsePrompt | None = None,
-        instructions: str | None = None,
-        previous_response_id: str | None = None,
-        conversation: str | None = None,
-        store: bool | None = True,
-        stream: bool | None = False,
-        temperature: float | None = None,
-        text: OpenAIResponseText | None = None,
-        tools: list[OpenAIResponseInputTool] | None = None,
-        include: list[str] | None = None,
-        max_infer_iters: int | None = 10,  # this is an extension to the OpenAI API
-        guardrails: Annotated[
-            list[ResponseGuardrail] | None,
-            ExtraBodyField(
-                "List of guardrails to apply during response generation. Guardrails provide safety and content moderation."
-            ),
-        ] = None,
-        max_tool_calls: int | None = None,
-    ) -> OpenAIResponseObject | AsyncIterator[OpenAIResponseObjectStream]:
-        """Create a model response.
-
-        :param input: Input message(s) to create the response.
-        :param model: The underlying LLM used for completions.
-        :param prompt: (Optional) Prompt object with ID, version, and variables.
-        :param previous_response_id: (Optional) if specified, the new response will be a continuation of the previous response. This can be used to easily fork-off new responses from existing responses.
-        :param conversation: (Optional) The ID of a conversation to add the response to. Must begin with 'conv_'. Input and output messages will be automatically added to the conversation.
-        :param include: (Optional) Additional fields to include in the response.
-        :param guardrails: (Optional) List of guardrails to apply during response generation. Can be guardrail IDs (strings) or guardrail specifications.
-        :param max_tool_calls: (Optional) Max number of total calls to built-in tools that can be processed in a response.
-        :returns: An OpenAIResponseObject.
-        """
-        ...
-
-    @webmethod(route="/responses", method="GET", level=LLAMA_STACK_API_V1)
-    async def list_openai_responses(
-        self,
-        after: str | None = None,
-        limit: int | None = 50,
-        model: str | None = None,
-        order: Order | None = Order.desc,
-    ) -> ListOpenAIResponseObject:
-        """List all responses.
-
-        :param after: The ID of the last response to return.
-        :param limit: The number of responses to return.
-        :param model: The model to filter responses by.
-        :param order: The order to sort responses by when sorted by created_at ('asc' or 'desc').
-        :returns: A ListOpenAIResponseObject.
-        """
-        ...
-
-    @webmethod(route="/responses/{response_id}/input_items", method="GET", level=LLAMA_STACK_API_V1)
-    async def list_openai_response_input_items(
-        self,
-        response_id: str,
-        after: str | None = None,
-        before: str | None = None,
-        include: list[str] | None = None,
-        limit: int | None = 20,
-        order: Order | None = Order.desc,
-    ) -> ListOpenAIResponseInputItem:
-        """List input items.
-
-        :param response_id: The ID of the response to retrieve input items for.
-        :param after: An item ID to list items after, used for pagination.
-        :param before: An item ID to list items before, used for pagination.
-        :param include: Additional fields to include in the response.
-        :param limit: A limit on the number of objects to be returned. Limit can range between 1 and 100, and the default is 20.
-        :param order: The order to return the input items in. Default is desc.
-        :returns: An ListOpenAIResponseInputItem.
-        """
-        ...
-
-    @webmethod(route="/responses/{response_id}", method="DELETE", level=LLAMA_STACK_API_V1)
-    async def delete_openai_response(self, response_id: str) -> OpenAIDeleteResponseObject:
-        """Delete a response.
-
-        :param response_id: The ID of the OpenAI response to delete.
-        :returns: An OpenAIDeleteResponseObject
-        """
-        ...
--- a/src/llama_stack/apis/agents/openai_responses.py
+++ b/src/llama_stack/apis/agents/openai_responses.py
--- a/src/llama_stack/apis/batches/__init__.py
+++ b/src/llama_stack/apis/batches/__init__.py
@@ -1,9 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from .batches import Batches, BatchObject, ListBatchesResponse
-
-__all__ = ["Batches", "BatchObject", "ListBatchesResponse"]
--- a/src/llama_stack/apis/batches/batches.py
+++ b/src/llama_stack/apis/batches/batches.py
@@ -1,96 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from typing import Literal, Protocol, runtime_checkable
-
-from pydantic import BaseModel, Field
-
-from llama_stack.apis.version import LLAMA_STACK_API_V1
-from llama_stack.schema_utils import json_schema_type, webmethod
-
-try:
-    from openai.types import Batch as BatchObject
-except ImportError as e:
-    raise ImportError("OpenAI package is required for batches API. Please install it with: pip install openai") from e
-
-
-@json_schema_type
-class ListBatchesResponse(BaseModel):
-    """Response containing a list of batch objects."""
-
-    object: Literal["list"] = "list"
-    data: list[BatchObject] = Field(..., description="List of batch objects")
-    first_id: str | None = Field(default=None, description="ID of the first batch in the list")
-    last_id: str | None = Field(default=None, description="ID of the last batch in the list")
-    has_more: bool = Field(default=False, description="Whether there are more batches available")
-
-
-@runtime_checkable
-class Batches(Protocol):
-    """
-    The Batches API enables efficient processing of multiple requests in a single operation,
-    particularly useful for processing large datasets, batch evaluation workflows, and
-    cost-effective inference at scale.
-
-    The API is designed to allow use of openai client libraries for seamless integration.
-
-    This API provides the following extensions:
-     - idempotent batch creation
-
-    Note: This API is currently under active development and may undergo changes.
-    """
-
-    @webmethod(route="/batches", method="POST", level=LLAMA_STACK_API_V1)
-    async def create_batch(
-        self,
-        input_file_id: str,
-        endpoint: str,
-        completion_window: Literal["24h"],
-        metadata: dict[str, str] | None = None,
-        idempotency_key: str | None = None,
-    ) -> BatchObject:
-        """Create a new batch for processing multiple API requests.
-
-        :param input_file_id: The ID of an uploaded file containing requests for the batch.
-        :param endpoint: The endpoint to be used for all requests in the batch.
-        :param completion_window: The time window within which the batch should be processed.
-        :param metadata: Optional metadata for the batch.
-        :param idempotency_key: Optional idempotency key. When provided, enables idempotent behavior.
-        :returns: The created batch object.
-        """
-        ...
-
-    @webmethod(route="/batches/{batch_id}", method="GET", level=LLAMA_STACK_API_V1)
-    async def retrieve_batch(self, batch_id: str) -> BatchObject:
-        """Retrieve information about a specific batch.
-
-        :param batch_id: The ID of the batch to retrieve.
-        :returns: The batch object.
-        """
-        ...
-
-    @webmethod(route="/batches/{batch_id}/cancel", method="POST", level=LLAMA_STACK_API_V1)
-    async def cancel_batch(self, batch_id: str) -> BatchObject:
-        """Cancel a batch that is in progress.
-
-        :param batch_id: The ID of the batch to cancel.
-        :returns: The updated batch object.
-        """
-        ...
-
-    @webmethod(route="/batches", method="GET", level=LLAMA_STACK_API_V1)
-    async def list_batches(
-        self,
-        after: str | None = None,
-        limit: int = 20,
-    ) -> ListBatchesResponse:
-        """List all batches for the current user.
-
-        :param after: A cursor for pagination; returns batches after this batch ID.
-        :param limit: Number of batches to return (default 20, max 100).
-        :returns: A list of batch objects.
-        """
-        ...
--- a/src/llama_stack/apis/benchmarks/__init__.py
+++ b/src/llama_stack/apis/benchmarks/__init__.py
@@ -1,7 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from .benchmarks import *
--- a/src/llama_stack/apis/benchmarks/benchmarks.py
+++ b/src/llama_stack/apis/benchmarks/benchmarks.py
@@ -1,104 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-from typing import Any, Literal, Protocol, runtime_checkable
-
-from pydantic import BaseModel, Field
-
-from llama_stack.apis.resource import Resource, ResourceType
-from llama_stack.apis.version import LLAMA_STACK_API_V1ALPHA
-from llama_stack.schema_utils import json_schema_type, webmethod
-
-
-class CommonBenchmarkFields(BaseModel):
-    dataset_id: str
-    scoring_functions: list[str]
-    metadata: dict[str, Any] = Field(
-        default_factory=dict,
-        description="Metadata for this evaluation task",
-    )
-
-
-@json_schema_type
-class Benchmark(CommonBenchmarkFields, Resource):
-    """A benchmark resource for evaluating model performance.
-
-    :param dataset_id: Identifier of the dataset to use for the benchmark evaluation
-    :param scoring_functions: List of scoring function identifiers to apply during evaluation
-    :param metadata: Metadata for this evaluation task
-    :param type: The resource type, always benchmark
-    """
-
-    type: Literal[ResourceType.benchmark] = ResourceType.benchmark
-
-    @property
-    def benchmark_id(self) -> str:
-        return self.identifier
-
-    @property
-    def provider_benchmark_id(self) -> str | None:
-        return self.provider_resource_id
-
-
-class BenchmarkInput(CommonBenchmarkFields, BaseModel):
-    benchmark_id: str
-    provider_id: str | None = None
-    provider_benchmark_id: str | None = None
-
-
-class ListBenchmarksResponse(BaseModel):
-    data: list[Benchmark]
-
-
-@runtime_checkable
-class Benchmarks(Protocol):
-    @webmethod(route="/eval/benchmarks", method="GET", level=LLAMA_STACK_API_V1ALPHA)
-    async def list_benchmarks(self) -> ListBenchmarksResponse:
-        """List all benchmarks.
-
-        :returns: A ListBenchmarksResponse.
-        """
-        ...
-
-    @webmethod(route="/eval/benchmarks/{benchmark_id}", method="GET", level=LLAMA_STACK_API_V1ALPHA)
-    async def get_benchmark(
-        self,
-        benchmark_id: str,
-    ) -> Benchmark:
-        """Get a benchmark by its ID.
-
-        :param benchmark_id: The ID of the benchmark to get.
-        :returns: A Benchmark.
-        """
-        ...
-
-    @webmethod(route="/eval/benchmarks", method="POST", level=LLAMA_STACK_API_V1ALPHA, deprecated=True)
-    async def register_benchmark(
-        self,
-        benchmark_id: str,
-        dataset_id: str,
-        scoring_functions: list[str],
-        provider_benchmark_id: str | None = None,
-        provider_id: str | None = None,
-        metadata: dict[str, Any] | None = None,
-    ) -> None:
-        """Register a benchmark.
-
-        :param benchmark_id: The ID of the benchmark to register.
-        :param dataset_id: The ID of the dataset to use for the benchmark.
-        :param scoring_functions: The scoring functions to use for the benchmark.
-        :param provider_benchmark_id: The ID of the provider benchmark to use for the benchmark.
-        :param provider_id: The ID of the provider to use for the benchmark.
-        :param metadata: The metadata to use for the benchmark.
-        """
-        ...
-
-    @webmethod(route="/eval/benchmarks/{benchmark_id}", method="DELETE", level=LLAMA_STACK_API_V1ALPHA, deprecated=True)
-    async def unregister_benchmark(self, benchmark_id: str) -> None:
-        """Unregister a benchmark.
-
-        :param benchmark_id: The ID of the benchmark to unregister.
-        """
-        ...
--- a/src/llama_stack/apis/common/__init__.py
+++ b/src/llama_stack/apis/common/__init__.py
@@ -1,5 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
--- a/src/llama_stack/apis/common/content_types.py
+++ b/src/llama_stack/apis/common/content_types.py
@@ -1,143 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from enum import Enum
-from typing import Annotated, Literal
-
-from pydantic import BaseModel, Field, model_validator
-
-from llama_stack.models.llama.datatypes import ToolCall
-from llama_stack.schema_utils import json_schema_type, register_schema
-
-
-@json_schema_type
-class URL(BaseModel):
-    """A URL reference to external content.
-
-    :param uri: The URL string pointing to the resource
-    """
-
-    uri: str
-
-
-class _URLOrData(BaseModel):
-    """
-    A URL or a base64 encoded string
-
-    :param url: A URL of the image or data URL in the format of data:image/{type};base64,{data}. Note that URL could have length limits.
-    :param data: base64 encoded image data as string
-    """
-
-    url: URL | None = None
-    # data is a base64 encoded string, hint with contentEncoding=base64
-    data: str | None = Field(default=None, json_schema_extra={"contentEncoding": "base64"})
-
-    @model_validator(mode="before")
-    @classmethod
-    def validator(cls, values):
-        if isinstance(values, dict):
-            return values
-        return {"url": values}
-
-
-@json_schema_type
-class ImageContentItem(BaseModel):
-    """A image content item
-
-    :param type: Discriminator type of the content item. Always "image"
-    :param image: Image as a base64 encoded string or an URL
-    """
-
-    type: Literal["image"] = "image"
-    image: _URLOrData
-
-
-@json_schema_type
-class TextContentItem(BaseModel):
-    """A text content item
-
-    :param type: Discriminator type of the content item. Always "text"
-    :param text: Text content
-    """
-
-    type: Literal["text"] = "text"
-    text: str
-
-
-# other modalities can be added here
-InterleavedContentItem = Annotated[
-    ImageContentItem | TextContentItem,
-    Field(discriminator="type"),
-]
-register_schema(InterleavedContentItem, name="InterleavedContentItem")
-
-# accept a single "str" as a special case since it is common
-InterleavedContent = str | InterleavedContentItem | list[InterleavedContentItem]
-register_schema(InterleavedContent, name="InterleavedContent")
-
-
-@json_schema_type
-class TextDelta(BaseModel):
-    """A text content delta for streaming responses.
-
-    :param type: Discriminator type of the delta. Always "text"
-    :param text: The incremental text content
-    """
-
-    type: Literal["text"] = "text"
-    text: str
-
-
-@json_schema_type
-class ImageDelta(BaseModel):
-    """An image content delta for streaming responses.
-
-    :param type: Discriminator type of the delta. Always "image"
-    :param image: The incremental image data as bytes
-    """
-
-    type: Literal["image"] = "image"
-    image: bytes
-
-
-class ToolCallParseStatus(Enum):
-    """Status of tool call parsing during streaming.
-    :cvar started: Tool call parsing has begun
-    :cvar in_progress: Tool call parsing is ongoing
-    :cvar failed: Tool call parsing failed
-    :cvar succeeded: Tool call parsing completed successfully
-    """
-
-    started = "started"
-    in_progress = "in_progress"
-    failed = "failed"
-    succeeded = "succeeded"
-
-
-@json_schema_type
-class ToolCallDelta(BaseModel):
-    """A tool call content delta for streaming responses.
-
-    :param type: Discriminator type of the delta. Always "tool_call"
-    :param tool_call: Either an in-progress tool call string or the final parsed tool call
-    :param parse_status: Current parsing status of the tool call
-    """
-
-    type: Literal["tool_call"] = "tool_call"
-
-    # you either send an in-progress tool call so the client can stream a long
-    # code generation or you send the final parsed tool call at the end of the
-    # stream
-    tool_call: str | ToolCall
-    parse_status: ToolCallParseStatus
-
-
-# streaming completions send a stream of ContentDeltas
-ContentDelta = Annotated[
-    TextDelta | ImageDelta | ToolCallDelta,
-    Field(discriminator="type"),
-]
-register_schema(ContentDelta, name="ContentDelta")
--- a/src/llama_stack/apis/common/errors.py
+++ b/src/llama_stack/apis/common/errors.py
@@ -1,95 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-# Custom Llama Stack Exception classes should follow the following schema
-#   1. All classes should inherit from an existing Built-In Exception class: https://docs.python.org/3/library/exceptions.html
-#   2. All classes should have a custom error message with the goal of informing the Llama Stack user specifically
-#   3. All classes should propogate the inherited __init__ function otherwise via 'super().__init__(message)'
-
-
-class ResourceNotFoundError(ValueError):
-    """generic exception for a missing Llama Stack resource"""
-
-    def __init__(self, resource_name: str, resource_type: str, client_list: str) -> None:
-        message = (
-            f"{resource_type} '{resource_name}' not found. Use '{client_list}' to list available {resource_type}s."
-        )
-        super().__init__(message)
-
-
-class UnsupportedModelError(ValueError):
-    """raised when model is not present in the list of supported models"""
-
-    def __init__(self, model_name: str, supported_models_list: list[str]):
-        message = f"'{model_name}' model is not supported. Supported models are: {', '.join(supported_models_list)}"
-        super().__init__(message)
-
-
-class ModelNotFoundError(ResourceNotFoundError):
-    """raised when Llama Stack cannot find a referenced model"""
-
-    def __init__(self, model_name: str) -> None:
-        super().__init__(model_name, "Model", "client.models.list()")
-
-
-class VectorStoreNotFoundError(ResourceNotFoundError):
-    """raised when Llama Stack cannot find a referenced vector store"""
-
-    def __init__(self, vector_store_name: str) -> None:
-        super().__init__(vector_store_name, "Vector Store", "client.vector_dbs.list()")
-
-
-class DatasetNotFoundError(ResourceNotFoundError):
-    """raised when Llama Stack cannot find a referenced dataset"""
-
-    def __init__(self, dataset_name: str) -> None:
-        super().__init__(dataset_name, "Dataset", "client.datasets.list()")
-
-
-class ToolGroupNotFoundError(ResourceNotFoundError):
-    """raised when Llama Stack cannot find a referenced tool group"""
-
-    def __init__(self, toolgroup_name: str) -> None:
-        super().__init__(toolgroup_name, "Tool Group", "client.toolgroups.list()")
-
-
-class ModelTypeError(TypeError):
-    """raised when a model is present but not the correct type"""
-
-    def __init__(self, model_name: str, model_type: str, expected_model_type: str) -> None:
-        message = (
-            f"Model '{model_name}' is of type '{model_type}' rather than the expected type '{expected_model_type}'"
-        )
-        super().__init__(message)
-
-
-class ConflictError(ValueError):
-    """raised when an operation cannot be performed due to a conflict with the current state"""
-
-    def __init__(self, message: str) -> None:
-        super().__init__(message)
-
-
-class TokenValidationError(ValueError):
-    """raised when token validation fails during authentication"""
-
-    def __init__(self, message: str) -> None:
-        super().__init__(message)
-
-
-class ConversationNotFoundError(ResourceNotFoundError):
-    """raised when Llama Stack cannot find a referenced conversation"""
-
-    def __init__(self, conversation_id: str) -> None:
-        super().__init__(conversation_id, "Conversation", "client.conversations.list()")
-
-
-class InvalidConversationIdError(ValueError):
-    """raised when a conversation ID has an invalid format"""
-
-    def __init__(self, conversation_id: str) -> None:
-        message = f"Invalid conversation ID '{conversation_id}'. Expected an ID that begins with 'conv_'."
-        super().__init__(message)
--- a/src/llama_stack/apis/common/job_types.py
+++ b/src/llama_stack/apis/common/job_types.py
@@ -1,38 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-from enum import Enum
-
-from pydantic import BaseModel
-
-from llama_stack.schema_utils import json_schema_type
-
-
-class JobStatus(Enum):
-    """Status of a job execution.
-    :cvar completed: Job has finished successfully
-    :cvar in_progress: Job is currently running
-    :cvar failed: Job has failed during execution
-    :cvar scheduled: Job is scheduled but not yet started
-    :cvar cancelled: Job was cancelled before completion
-    """
-
-    completed = "completed"
-    in_progress = "in_progress"
-    failed = "failed"
-    scheduled = "scheduled"
-    cancelled = "cancelled"
-
-
-@json_schema_type
-class Job(BaseModel):
-    """A job execution instance with status tracking.
-
-    :param job_id: Unique identifier for the job
-    :param status: Current execution status of the job
-    """
-
-    job_id: str
-    status: JobStatus
--- a/src/llama_stack/apis/common/responses.py
+++ b/src/llama_stack/apis/common/responses.py
@@ -1,77 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from enum import Enum
-from typing import Any
-
-from pydantic import BaseModel
-
-from llama_stack.schema_utils import json_schema_type
-
-
-class Order(Enum):
-    """Sort order for paginated responses.
-    :cvar asc: Ascending order
-    :cvar desc: Descending order
-    """
-
-    asc = "asc"
-    desc = "desc"
-
-
-@json_schema_type
-class PaginatedResponse(BaseModel):
-    """A generic paginated response that follows a simple format.
-
-    :param data: The list of items for the current page
-    :param has_more: Whether there are more items available after this set
-    :param url: The URL for accessing this list
-    """
-
-    data: list[dict[str, Any]]
-    has_more: bool
-    url: str | None = None
-
-
-# This is a short term solution to allow inference API to return metrics
-# The ideal way to do this is to have a way for all response types to include metrics
-# and all metric events logged to the telemetry API to be included with the response
-# To do this, we will need to augment all response types with a metrics field.
-# We have hit a blocker from stainless SDK that prevents us from doing this.
-# The blocker is that if we were to augment the response types that have a data field
-# in them like so
-# class ListModelsResponse(BaseModel):
-# metrics: Optional[List[MetricEvent]] = None
-# data: List[Models]
-# ...
-# The client SDK will need to access the data by using a .data field, which is not
-# ergonomic. Stainless SDK does support unwrapping the response type, but it
-# requires that the response type to only have a single field.
-
-# We will need a way in the client SDK to signal that the metrics are needed
-# and if they are needed, the client SDK has to return the full response type
-# without unwrapping it.
-
-
-@json_schema_type
-class MetricInResponse(BaseModel):
-    """A metric value included in API responses.
-    :param metric: The name of the metric
-    :param value: The numeric value of the metric
-    :param unit: (Optional) The unit of measurement for the metric value
-    """
-
-    metric: str
-    value: int | float
-    unit: str | None = None
-
-
-class MetricResponseMixin(BaseModel):
-    """Mixin class for API responses that can include metrics.
-    :param metrics: (Optional) List of metrics associated with the API response
-    """
-
-    metrics: list[MetricInResponse] | None = None
--- a/src/llama_stack/apis/common/tracing.py
+++ b/src/llama_stack/apis/common/tracing.py
@@ -1,22 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-
-def telemetry_traceable(cls):
-    """
-    Mark a protocol for automatic tracing when telemetry is enabled.
-
-    This is a metadata-only decorator with no dependencies on core.
-    Actual tracing is applied by core routers at runtime if telemetry is enabled.
-
-    Usage:
-        @runtime_checkable
-        @telemetry_traceable
-        class MyProtocol(Protocol):
-            ...
-    """
-    cls.__marked_for_tracing__ = True
-    return cls
--- a/src/llama_stack/apis/common/training_types.py
+++ b/src/llama_stack/apis/common/training_types.py
@@ -1,47 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from datetime import datetime
-
-from pydantic import BaseModel
-
-from llama_stack.schema_utils import json_schema_type
-
-
-@json_schema_type
-class PostTrainingMetric(BaseModel):
-    """Training metrics captured during post-training jobs.
-
-    :param epoch: Training epoch number
-    :param train_loss: Loss value on the training dataset
-    :param validation_loss: Loss value on the validation dataset
-    :param perplexity: Perplexity metric indicating model confidence
-    """
-
-    epoch: int
-    train_loss: float
-    validation_loss: float
-    perplexity: float
-
-
-@json_schema_type
-class Checkpoint(BaseModel):
-    """Checkpoint created during training runs.
-
-    :param identifier: Unique identifier for the checkpoint
-    :param created_at: Timestamp when the checkpoint was created
-    :param epoch: Training epoch when the checkpoint was saved
-    :param post_training_job_id: Identifier of the training job that created this checkpoint
-    :param path: File system path where the checkpoint is stored
-    :param training_metrics: (Optional) Training metrics associated with this checkpoint
-    """
-
-    identifier: str
-    created_at: datetime
-    epoch: int
-    post_training_job_id: str
-    path: str
-    training_metrics: PostTrainingMetric | None = None
--- a/src/llama_stack/apis/common/type_system.py
+++ b/src/llama_stack/apis/common/type_system.py
@ -1,146 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from typing import Annotated, Literal
-
-from pydantic import BaseModel, Field
-
-from llama_stack.schema_utils import json_schema_type, register_schema
-
-
-@json_schema_type
-class StringType(BaseModel):
-    """Parameter type for string values.
-
-    :param type: Discriminator type. Always "string"
-    """
-
-    # Constant tag: the Literal annotation admits only "string", which is also the default.
-    type: Literal["string"] = "string"
-
-
-@json_schema_type
-class NumberType(BaseModel):
-    """Parameter type for numeric values.
-
-    :param type: Discriminator type. Always "number"
-    """
-
-    # Constant tag: the Literal annotation admits only "number", which is also the default.
-    type: Literal["number"] = "number"
-
-
-@json_schema_type
-class BooleanType(BaseModel):
-    """Parameter type for boolean values.
-
-    :param type: Discriminator type. Always "boolean"
-    """
-
-    # Constant tag: the Literal annotation admits only "boolean", which is also the default.
-    type: Literal["boolean"] = "boolean"
-
-
-@json_schema_type
-class ArrayType(BaseModel):
-    """Parameter type for array values.
-
-    :param type: Discriminator type. Always "array"
-    """
-
-    # Only the tag is modelled; this class carries no element-type information.
-    type: Literal["array"] = "array"
-
-
-@json_schema_type
-class ObjectType(BaseModel):
-    """Parameter type for object values.
-
-    :param type: Discriminator type. Always "object"
-    """
-
-    # Only the tag is modelled; this class carries no property/schema information.
-    type: Literal["object"] = "object"
-
-
-@json_schema_type
-class JsonType(BaseModel):
-    """Parameter type for JSON values.
-
-    :param type: Discriminator type. Always "json"
-    """
-
-    # Constant tag: the Literal annotation admits only "json", which is also the default.
-    type: Literal["json"] = "json"
-
-
-@json_schema_type
-class UnionType(BaseModel):
-    """Parameter type for union values.
-
-    :param type: Discriminator type. Always "union"
-    """
-
-    # Only the tag is modelled; this class does not list the union's member types.
-    type: Literal["union"] = "union"
-
-
-@json_schema_type
-class ChatCompletionInputType(BaseModel):
-    """Parameter type for chat completion input.
-
-    :param type: Discriminator type. Always "chat_completion_input"
-    """
-
-    # expects List[Message] for messages
-    # (the expectation above is documentation only; this model declares no field besides the tag)
-    type: Literal["chat_completion_input"] = "chat_completion_input"
-
-
-@json_schema_type
-class CompletionInputType(BaseModel):
-    """Parameter type for completion input.
-
-    :param type: Discriminator type. Always "completion_input"
-    """
-
-    # expects InterleavedTextMedia for content
-    # (the expectation above is documentation only; this model declares no field besides the tag)
-    type: Literal["completion_input"] = "completion_input"
-
-
-@json_schema_type
-class DialogType(BaseModel):
-    """Parameter type for dialog data with semantic output labels.
-
-    :param type: Discriminator type. Always "dialog"
-    """
-
-    # expects List[Message] for messages
-    # this type semantically contains the output label whereas ChatCompletionInputType does not
-    # NOTE(review): DialogType is not listed in the ParamType union defined in this module — confirm that is intentional.
-    type: Literal["dialog"] = "dialog"
-
-
-# Tagged union of the parameter-type models above; the "type" field of each member
-# selects which model is used. The union is then registered under the name "ParamType".
-# NOTE(review): DialogType is defined in this module but is not a member — confirm intentional.
-ParamType = Annotated[
-    StringType
-    | NumberType
-    | BooleanType
-    | ArrayType
-    | ObjectType
-    | JsonType
-    | UnionType
-    | ChatCompletionInputType
-    | CompletionInputType,
-    Field(discriminator="type"),
-]
-register_schema(ParamType, name="ParamType")
-
-# The triple-quoted block below is a bare string expression, not a docstring: it keeps
-# the recursive-type code out of execution while preserving it for reference.
-"""
-# TODO: recursive definition of ParamType in these containers
-# will cause infinite recursion in OpenAPI generation script
-# since we are going with ChatCompletionInputType and CompletionInputType
-# we don't need to worry about ArrayType/ObjectType/UnionType for now
-ArrayType.model_rebuild()
-ObjectType.model_rebuild()
-UnionType.model_rebuild()
-
-
-class CustomType(BaseModel):
-pylint: disable=syntax-error
-    type: Literal["custom"] = "custom"
-    validator_class: str
-"""
--- a/src/llama_stack/apis/conversations/init.py
+++ b/src/llama_stack/apis/conversations/init.py
@ -1,27 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from .conversations import (
-    Conversation,
-    ConversationDeletedResource,
-    ConversationItem,
-    ConversationItemCreateRequest,
-    ConversationItemDeletedResource,
-    ConversationItemList,
-    Conversations,
-    Metadata,
-)
-
-# Public names of the conversations package; mirrors the names imported from
-# .conversations above, one entry per import.
-__all__ = [
-    "Conversation",
-    "ConversationDeletedResource",
-    "ConversationItem",
-    "ConversationItemCreateRequest",
-    "ConversationItemDeletedResource",
-    "ConversationItemList",
-    "Conversations",
-    "Metadata",
-]
--- a/src/llama_stack/apis/conversations/conversations.py
+++ b/src/llama_stack/apis/conversations/conversations.py
@ -1,272 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from enum import StrEnum
-from typing import Annotated, Literal, Protocol, runtime_checkable
-
-from pydantic import BaseModel, Field
-
-from llama_stack.apis.agents.openai_responses import (
-    OpenAIResponseInputFunctionToolCallOutput,
-    OpenAIResponseMCPApprovalRequest,
-    OpenAIResponseMCPApprovalResponse,
-    OpenAIResponseMessage,
-    OpenAIResponseOutputMessageFileSearchToolCall,
-    OpenAIResponseOutputMessageFunctionToolCall,
-    OpenAIResponseOutputMessageMCPCall,
-    OpenAIResponseOutputMessageMCPListTools,
-    OpenAIResponseOutputMessageWebSearchToolCall,
-)
-from llama_stack.apis.common.tracing import telemetry_traceable
-from llama_stack.apis.version import LLAMA_STACK_API_V1
-from llama_stack.schema_utils import json_schema_type, register_schema, webmethod
-
-# Alias for string-to-string metadata maps; the alias itself imposes no size limit.
-Metadata = dict[str, str]
-
-
-@json_schema_type
-class Conversation(BaseModel):
-    """OpenAI-compatible conversation object."""
-
-    id: str = Field(..., description="The unique ID of the conversation.")
-    # Constant tag: the Literal annotation admits only "conversation".
-    object: Literal["conversation"] = Field(
-        default="conversation", description="The object type, which is always conversation."
-    )
-    created_at: int = Field(
-        ..., description="The time at which the conversation was created, measured in seconds since the Unix epoch."
-    )
-    # NOTE(review): the description mentions 16 key-value pairs, but Metadata (dict[str, str])
-    # carries no size constraint — confirm where the limit is enforced, if at all.
-    metadata: Metadata | None = Field(
-        default=None,
-        description="Set of 16 key-value pairs that can be attached to an object. This can be useful for storing additional information about the object in a structured format, and querying for objects via API or the dashboard.",
-    )
-    # NOTE(review): items are untyped dicts here, and the "up to 20" limit in the description
-    # is not backed by a max_length on this field.
-    items: list[dict] | None = Field(
-        default=None,
-        description="Initial items to include in the conversation context. You may add up to 20 items at a time.",
-    )
-
-
-@json_schema_type
-class ConversationMessage(BaseModel):
-    """OpenAI-compatible message item for conversations."""
-
-    # NOTE(review): this model is not a member of the ConversationItem union defined in this
-    # module — confirm whether it is still used.
-    id: str = Field(..., description="unique identifier for this message")
-    content: list[dict] = Field(..., description="message content")
-    # role and status are free-form strings; no set of allowed values is declared here.
-    role: str = Field(..., description="message role")
-    status: str = Field(..., description="message status")
-    type: Literal["message"] = "message"
-    object: Literal["message"] = "message"
-
-
-# Tagged union of every item kind a conversation can hold; the "type" field of each
-# member selects which model is used. Each member is listed exactly once.
-ConversationItem = Annotated[
-    OpenAIResponseMessage
-    | OpenAIResponseOutputMessageWebSearchToolCall
-    | OpenAIResponseOutputMessageFileSearchToolCall
-    | OpenAIResponseOutputMessageFunctionToolCall
-    | OpenAIResponseInputFunctionToolCallOutput
-    | OpenAIResponseMCPApprovalRequest
-    | OpenAIResponseMCPApprovalResponse
-    | OpenAIResponseOutputMessageMCPCall
-    | OpenAIResponseOutputMessageMCPListTools,
-    Field(discriminator="type"),
-]
-register_schema(ConversationItem, name="ConversationItem")
-
-# Using OpenAI types directly caused issues but some notes for reference:
-# Note that ConversationItem is an Annotated Union of the types below:
-# from openai.types.responses import *
-# from openai.types.responses.response_item import *
-# from openai.types.conversations import ConversationItem
-# f = [
-#     ResponseFunctionToolCallItem,
-#     ResponseFunctionToolCallOutputItem,
-#     ResponseFileSearchToolCall,
-#     ResponseFunctionWebSearch,
-#     ImageGenerationCall,
-#     ResponseComputerToolCall,
-#     ResponseComputerToolCallOutputItem,
-#     ResponseReasoningItem,
-#     ResponseCodeInterpreterToolCall,
-#     LocalShellCall,
-#     LocalShellCallOutput,
-#     McpListTools,
-#     McpApprovalRequest,
-#     McpApprovalResponse,
-#     McpCall,
-#     ResponseCustomToolCall,
-#     ResponseCustomToolCallOutput
-# ]
-
-
-@json_schema_type
-class ConversationDeletedResource(BaseModel):
-    """Response for deleted conversation."""
-
-    id: str = Field(..., description="The deleted conversation identifier")
-    # Plain str with a default rather than a Literal, so other values are accepted.
-    object: str = Field(default="conversation.deleted", description="Object type")
-    deleted: bool = Field(default=True, description="Whether the object was deleted")
-
-
-@json_schema_type
-class ConversationItemCreateRequest(BaseModel):
-    """Request body for creating conversation items."""
-
-    # Required field; max_length=20 backs the "up to 20 items" limit stated in the description.
-    items: list[ConversationItem] = Field(
-        ...,
-        description="Items to include in the conversation context. You may add up to 20 items at a time.",
-        max_length=20,
-    )
-
-
-class ConversationItemInclude(StrEnum):
-    """
-    Specify additional output data to include in the model response.
-    """
-
-    # Member names mirror the dotted wire values with "." replaced by "_".
-    web_search_call_action_sources = "web_search_call.action.sources"
-    code_interpreter_call_outputs = "code_interpreter_call.outputs"
-    computer_call_output_output_image_url = "computer_call_output.output.image_url"
-    file_search_call_results = "file_search_call.results"
-    message_input_image_image_url = "message.input_image.image_url"
-    message_output_text_logprobs = "message.output_text.logprobs"
-    reasoning_encrypted_content = "reasoning.encrypted_content"
-
-
-@json_schema_type
-class ConversationItemList(BaseModel):
-    """List of conversation items with pagination."""
-
-    # Plain str with a default rather than a Literal, so other values are accepted.
-    object: str = Field(default="list", description="Object type")
-    # The only required field; the pagination fields below all have defaults.
-    data: list[ConversationItem] = Field(..., description="List of conversation items")
-    first_id: str | None = Field(default=None, description="The ID of the first item in the list")
-    last_id: str | None = Field(default=None, description="The ID of the last item in the list")
-    has_more: bool = Field(default=False, description="Whether there are more items available")
-
-
-@json_schema_type
-class ConversationItemDeletedResource(BaseModel):
-    """Response for deleted conversation item."""
-
-    id: str = Field(..., description="The deleted item identifier")
-    # Plain str with a default rather than a Literal, so other values are accepted.
-    object: str = Field(default="conversation.item.deleted", description="Object type")
-    deleted: bool = Field(default=True, description="Whether the object was deleted")
-
-
-# Every method below is a stub (body is `...`); the route, HTTP method and API level
-# are declared through the webmethod decorator on each one.
-@runtime_checkable
-@telemetry_traceable
-class Conversations(Protocol):
-    """Conversations
-
-    Protocol for conversation management operations."""
-
-    @webmethod(route="/conversations", method="POST", level=LLAMA_STACK_API_V1)
-    async def create_conversation(
-        self, items: list[ConversationItem] | None = None, metadata: Metadata | None = None
-    ) -> Conversation:
-        """Create a conversation.
-
-        Create a conversation.
-
-        :param items: Initial items to include in the conversation context.
-        :param metadata: Set of key-value pairs that can be attached to an object.
-        :returns: The created conversation object.
-        """
-        ...
-
-    @webmethod(route="/conversations/{conversation_id}", method="GET", level=LLAMA_STACK_API_V1)
-    async def get_conversation(self, conversation_id: str) -> Conversation:
-        """Retrieve a conversation.
-
-        Get a conversation with the given ID.
-
-        :param conversation_id: The conversation identifier.
-        :returns: The conversation object.
-        """
-        ...
-
-    # Shares its route with get_conversation; only the HTTP method (POST vs GET) differs.
-    @webmethod(route="/conversations/{conversation_id}", method="POST", level=LLAMA_STACK_API_V1)
-    async def update_conversation(self, conversation_id: str, metadata: Metadata) -> Conversation:
-        """Update a conversation.
-
-        Update a conversation's metadata with the given ID.
-
-        :param conversation_id: The conversation identifier.
-        :param metadata: Set of key-value pairs that can be attached to an object.
-        :returns: The updated conversation object.
-        """
-        ...
-
-    @webmethod(route="/conversations/{conversation_id}", method="DELETE", level=LLAMA_STACK_API_V1)
-    async def openai_delete_conversation(self, conversation_id: str) -> ConversationDeletedResource:
-        """Delete a conversation.
-
-        Delete a conversation with the given ID.
-
-        :param conversation_id: The conversation identifier.
-        :returns: The deleted conversation resource.
-        """
-        ...
-
-    # NOTE(review): the 20-item cap declared on ConversationItemCreateRequest.items is not
-    # expressed in this signature — confirm where it is enforced.
-    @webmethod(route="/conversations/{conversation_id}/items", method="POST", level=LLAMA_STACK_API_V1)
-    async def add_items(self, conversation_id: str, items: list[ConversationItem]) -> ConversationItemList:
-        """Create items.
-
-        Create items in the conversation.
-
-        :param conversation_id: The conversation identifier.
-        :param items: Items to include in the conversation context.
-        :returns: List of created items.
-        """
-        ...
-
-    @webmethod(route="/conversations/{conversation_id}/items/{item_id}", method="GET", level=LLAMA_STACK_API_V1)
-    async def retrieve(self, conversation_id: str, item_id: str) -> ConversationItem:
-        """Retrieve an item.
-
-        Retrieve a conversation item.
-
-        :param conversation_id: The conversation identifier.
-        :param item_id: The item identifier.
-        :returns: The conversation item.
-        """
-        ...
-
-    # NOTE(review): the docstring states defaults (limit 20, order desc) while the signature
-    # defaults both to None — presumably implementations apply them; confirm.
-    @webmethod(route="/conversations/{conversation_id}/items", method="GET", level=LLAMA_STACK_API_V1)
-    async def list_items(
-        self,
-        conversation_id: str,
-        after: str | None = None,
-        include: list[ConversationItemInclude] | None = None,
-        limit: int | None = None,
-        order: Literal["asc", "desc"] | None = None,
-    ) -> ConversationItemList:
-        """List items.
-
-        List items in the conversation.
-
-        :param conversation_id: The conversation identifier.
-        :param after: An item ID to list items after, used in pagination.
-        :param include: Specify additional output data to include in the response.
-        :param limit: A limit on the number of objects to be returned (1-100, default 20).
-        :param order: The order to return items in (asc or desc, default desc).
-        :returns: List of conversation items.
-        """
-        ...
-
-    @webmethod(route="/conversations/{conversation_id}/items/{item_id}", method="DELETE", level=LLAMA_STACK_API_V1)
-    async def openai_delete_conversation_item(
-        self, conversation_id: str, item_id: str
-    ) -> ConversationItemDeletedResource:
-        """Delete an item.
-
-        Delete a conversation item.
-
-        :param conversation_id: The conversation identifier.
-        :param item_id: The item identifier.
-        :returns: The deleted item resource.
-        """
-        ...
--- a/src/llama_stack/apis/datasetio/init.py
+++ b/src/llama_stack/apis/datasetio/init.py
@ -1,7 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from .datasetio import *
--- a/src/llama_stack/apis/datasetio/datasetio.py
+++ b/src/llama_stack/apis/datasetio/datasetio.py
@ -1,55 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from typing import Any, Protocol, runtime_checkable
-
-from llama_stack.apis.common.responses import PaginatedResponse
-from llama_stack.apis.datasets import Dataset
-from llama_stack.apis.version import LLAMA_STACK_API_V1BETA
-from llama_stack.schema_utils import webmethod
-
-
-# Structural interface for looking up a Dataset by ID. Unlike DatasetIO below, it is
-# not decorated with @runtime_checkable and its single method is synchronous.
-class DatasetStore(Protocol):
-    def get_dataset(self, dataset_id: str) -> Dataset: ...
-
-
-# Protocol for reading rows from, and appending rows to, a dataset. Both methods are
-# stubs; their routes are declared at the LLAMA_STACK_API_V1BETA level.
-@runtime_checkable
-class DatasetIO(Protocol):
-    # keeping for aligning with inference/safety, but this is not used
-    dataset_store: DatasetStore
-
-    # NOTE(review): the ":path" suffix in the route presumably lets dataset IDs contain "/" —
-    # confirm against the router.
-    @webmethod(route="/datasetio/iterrows/{dataset_id:path}", method="GET", level=LLAMA_STACK_API_V1BETA)
-    async def iterrows(
-        self,
-        dataset_id: str,
-        start_index: int | None = None,
-        limit: int | None = None,
-    ) -> PaginatedResponse:
-        """Get a paginated list of rows from a dataset.
-
-        Uses offset-based pagination where:
-        - start_index: The starting index (0-based). If None, starts from beginning.
-        - limit: Number of items to return. If None or -1, returns all items.
-
-        The response includes:
-        - data: List of items for the current page.
-        - has_more: Whether there are more items available after this set.
-
-        :param dataset_id: The ID of the dataset to get the rows from.
-        :param start_index: Index into dataset for the first row to get. Get all rows if None.
-        :param limit: The number of rows to get.
-        :returns: A PaginatedResponse.
-        """
-        ...
-
-    @webmethod(route="/datasetio/append-rows/{dataset_id:path}", method="POST", level=LLAMA_STACK_API_V1BETA)
-    async def append_rows(self, dataset_id: str, rows: list[dict[str, Any]]) -> None:
-        """Append rows to a dataset.
-
-        :param dataset_id: The ID of the dataset to append the rows to.
-        :param rows: The rows to append to the dataset.
-        """
-        ...
--- a/src/llama_stack/apis/datasets/init.py
+++ b/src/llama_stack/apis/datasets/init.py
@ -1,7 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from .datasets import *
--- a/src/llama_stack/apis/datasets/datasets.py
+++ b/src/llama_stack/apis/datasets/datasets.py
@ -1,247 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from enum import Enum, StrEnum
-from typing import Annotated, Any, Literal, Protocol
-
-from pydantic import BaseModel, Field
-
-from llama_stack.apis.resource import Resource, ResourceType
-from llama_stack.apis.version import LLAMA_STACK_API_V1BETA
-from llama_stack.schema_utils import json_schema_type, register_schema, webmethod
-
-
-class DatasetPurpose(StrEnum):
-    """
-    Purpose of the dataset. Each purpose has a required input data schema.
-
-    :cvar post-training/messages: The dataset contains messages used for post-training.
-        {
-            "messages": [
-                {"role": "user", "content": "Hello, world!"},
-                {"role": "assistant", "content": "Hello, world!"},
-            ]
-        }
-    :cvar eval/question-answer: The dataset contains a question column and an answer column.
-        {
-            "question": "What is the capital of France?",
-            "answer": "Paris"
-        }
-    :cvar eval/messages-answer: The dataset contains a messages column with list of messages and an answer column.
-        {
-            "messages": [
-                {"role": "user", "content": "Hello, my name is John Doe."},
-                {"role": "assistant", "content": "Hello, John Doe. How can I help you today?"},
-                {"role": "user", "content": "What's my name?"},
-            ],
-            "answer": "John Doe"
-        }
-    """
-
-    # The :cvar entries above are keyed by the wire values ("post-training/messages", ...),
-    # not by the Python member names below.
-    post_training_messages = "post-training/messages"
-    eval_question_answer = "eval/question-answer"
-    eval_messages_answer = "eval/messages-answer"
-
-    # TODO: add more schemas here
-
-
-class DatasetType(Enum):
-    """
-    Type of the dataset source.
-    :cvar uri: The dataset can be obtained from a URI.
-    :cvar rows: The dataset is stored in rows.
-    """
-
-    # Plain Enum (not StrEnum, unlike DatasetPurpose). The values match the "type" literals
-    # declared on URIDataSource and RowsDataSource.
-    uri = "uri"
-    rows = "rows"
-
-
-@json_schema_type
-class URIDataSource(BaseModel):
-    """A dataset that can be obtained from a URI.
-    :param uri: The dataset can be obtained from a URI. E.g.
-        - "https://mywebsite.com/mydata.jsonl"
-        - "lsfs://mydata.jsonl"
-        - "data:csv;base64,{base64_content}"
-    """
-
-    # Constant tag; used as the discriminator of the DataSource union.
-    type: Literal["uri"] = "uri"
-    # Plain str: the URI schemes listed in the docstring are examples, not validated here.
-    uri: str
-
-
-@json_schema_type
-class RowsDataSource(BaseModel):
-    """A dataset stored in rows.
-    :param rows: The dataset is stored in rows. E.g.
-        - [
-            {"messages": [{"role": "user", "content": "Hello, world!"}, {"role": "assistant", "content": "Hello, world!"}]}
-        ]
-    """
-
-    # Constant tag; used as the discriminator of the DataSource union.
-    type: Literal["rows"] = "rows"
-    # Each row is a free-form mapping; no per-purpose schema is checked by this model.
-    rows: list[dict[str, Any]]
-
-
-# Tagged union of the two data-source models; the "type" field ("uri" or "rows") selects
-# which one is used. The union is then registered under the name "DataSource".
-DataSource = Annotated[
-    URIDataSource | RowsDataSource,
-    Field(discriminator="type"),
-]
-register_schema(DataSource, name="DataSource")
-
-
-class CommonDatasetFields(BaseModel):
-    """
-    Common fields for a dataset.
-
-    :param purpose: Purpose of the dataset indicating its intended use
-    :param source: Data source configuration for the dataset
-    :param metadata: Additional metadata for the dataset
-    """
-
-    # purpose and source are required; metadata falls back to a fresh empty dict.
-    purpose: DatasetPurpose
-    source: DataSource
-    metadata: dict[str, Any] = Field(
-        default_factory=dict,
-        description="Any additional metadata for this dataset",
-    )
-
-
-@json_schema_type
-class Dataset(CommonDatasetFields, Resource):
-    """Dataset resource for storing and accessing training or evaluation data.
-
-    :param type: Type of resource, always 'dataset' for datasets
-    """
-
-    type: Literal[ResourceType.dataset] = ResourceType.dataset
-
-    # Read-only aliases. identifier and provider_resource_id are not declared in this class;
-    # presumably they are inherited from Resource — verify against its definition.
-    @property
-    def dataset_id(self) -> str:
-        return self.identifier
-
-    @property
-    def provider_dataset_id(self) -> str | None:
-        return self.provider_resource_id
-
-
-# CommonDatasetFields already derives from BaseModel, so listing BaseModel again as a
-# base adds no extra behaviour.
-class DatasetInput(CommonDatasetFields, BaseModel):
-    """Input parameters for dataset operations.
-
-    :param dataset_id: Unique identifier for the dataset
-    """
-
-    # Required here, whereas Datasets.register_dataset accepts dataset_id=None.
-    dataset_id: str
-
-
-class ListDatasetsResponse(BaseModel):
-    """Response from listing datasets.
-
-    :param data: List of datasets
-    """
-
-    # Required field with no default; this model declares no pagination fields.
-    data: list[Dataset]
-
-
-# Protocol for registering, fetching, listing and unregistering datasets. All methods are
-# stubs at the LLAMA_STACK_API_V1BETA level. Unlike DatasetIO, this protocol is not
-# decorated with @runtime_checkable.
-class Datasets(Protocol):
-    # Marked deprecated=True in the decorator.
-    @webmethod(route="/datasets", method="POST", level=LLAMA_STACK_API_V1BETA, deprecated=True)
-    async def register_dataset(
-        self,
-        purpose: DatasetPurpose,
-        source: DataSource,
-        metadata: dict[str, Any] | None = None,
-        dataset_id: str | None = None,
-    ) -> Dataset:
-        """
-        Register a new dataset.
-
-        :param purpose: The purpose of the dataset.
-        One of:
-            - "post-training/messages": The dataset contains a messages column with list of messages for post-training.
-                {
-                    "messages": [
-                        {"role": "user", "content": "Hello, world!"},
-                        {"role": "assistant", "content": "Hello, world!"},
-                    ]
-                }
-            - "eval/question-answer": The dataset contains a question column and an answer column for evaluation.
-                {
-                    "question": "What is the capital of France?",
-                    "answer": "Paris"
-                }
-            - "eval/messages-answer": The dataset contains a messages column with list of messages and an answer column for evaluation.
-                {
-                    "messages": [
-                        {"role": "user", "content": "Hello, my name is John Doe."},
-                        {"role": "assistant", "content": "Hello, John Doe. How can I help you today?"},
-                        {"role": "user", "content": "What's my name?"},
-                    ],
-                    "answer": "John Doe"
-                }
-        :param source: The data source of the dataset. Ensure that the data source schema is compatible with the purpose of the dataset. Examples:
-           - {
-               "type": "uri",
-               "uri": "https://mywebsite.com/mydata.jsonl"
-           }
-           - {
-               "type": "uri",
-               "uri": "lsfs://mydata.jsonl"
-           }
-           - {
-               "type": "uri",
-               "uri": "data:csv;base64,{base64_content}"
-           }
-           - {
-               "type": "uri",
-               "uri": "huggingface://llamastack/simpleqa?split=train"
-           }
-           - {
-               "type": "rows",
-               "rows": [
-                   {
-                       "messages": [
-                           {"role": "user", "content": "Hello, world!"},
-                           {"role": "assistant", "content": "Hello, world!"},
-                       ]
-                   }
-               ]
-           }
-        :param metadata: The metadata for the dataset.
-           - E.g. {"description": "My dataset"}.
-        :param dataset_id: The ID of the dataset. If not provided, an ID will be generated.
-        :returns: A Dataset.
-        """
-        ...
-
-    @webmethod(route="/datasets/{dataset_id:path}", method="GET", level=LLAMA_STACK_API_V1BETA)
-    async def get_dataset(
-        self,
-        dataset_id: str,
-    ) -> Dataset:
-        """Get a dataset by its ID.
-
-        :param dataset_id: The ID of the dataset to get.
-        :returns: A Dataset.
-        """
-        ...
-
-    @webmethod(route="/datasets", method="GET", level=LLAMA_STACK_API_V1BETA)
-    async def list_datasets(self) -> ListDatasetsResponse:
-        """List all datasets.
-
-        :returns: A ListDatasetsResponse.
-        """
-        ...
-
-    # Marked deprecated=True in the decorator; shares its route with get_dataset (DELETE vs GET).
-    @webmethod(route="/datasets/{dataset_id:path}", method="DELETE", level=LLAMA_STACK_API_V1BETA, deprecated=True)
-    async def unregister_dataset(
-        self,
-        dataset_id: str,
-    ) -> None:
-        """Unregister a dataset by its ID.
-
-        :param dataset_id: The ID of the dataset to unregister.
-        """
-        ...
--- a/src/llama_stack/apis/datatypes.py
+++ b/src/llama_stack/apis/datatypes.py
@ -1,158 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from enum import Enum, EnumMeta
-
-from pydantic import BaseModel, Field
-
-from llama_stack.schema_utils import json_schema_type
-
-
-class DynamicApiMeta(EnumMeta):
-    """Enum metaclass that allows extra members to be registered at runtime.
-
-    Members declared in the class body behave as usual. Members registered through
-    ``add()`` are inserted into ``_member_map_`` / ``_member_names_`` and indexed by
-    value in ``_dynamic_values`` so that ``cls(value)`` can resolve them.
-    """
-
-    def __new__(cls, name, bases, namespace):
-        """Create the enum class and attach the bookkeeping used for dynamic members."""
-        # Store the original enum values
-        original_values = {k: v for k, v in namespace.items() if not k.startswith("_")}
-
-        # Create the enum class
-        cls = super().__new__(cls, name, bases, namespace)
-
-        # Store the original values for reference
-        cls._original_values = original_values
-        # Initialize _dynamic_values
-        cls._dynamic_values = {}
-
-        return cls
-
-    def __call__(cls, value):
-        """Look up a member by value, falling back to dynamically registered members.
-
-        :param value: The member value to resolve.
-        :returns: The matching static or dynamic member.
-        :raises ValueError: If no member matches; new APIs must be registered via ``add()``.
-        """
-        try:
-            return super().__call__(value)
-        except ValueError as e:
-            # Dynamic members are always keyed by strings (add() lowercases the value), so a
-            # non-string can never match: re-raise the standard enum error instead of
-            # failing with AttributeError on value.lower() below.
-            if not isinstance(value, str):
-                raise
-
-            # If this value was already dynamically added, return it
-            if value in cls._dynamic_values:
-                return cls._dynamic_values[value]
-
-            # Derive the member name the same way add() does
-            member_name = value.lower().replace("-", "_")
-
-            # If this member name already exists in the enum, return the existing member
-            if member_name in cls._member_map_:
-                return cls._member_map_[member_name]
-
-            # Instead of creating a new member, raise ValueError to force users to use Api.add() to
-            # register new APIs explicitly
-            raise ValueError(f"API '{value}' does not exist. Use Api.add() to register new APIs.") from e
-
-    def __iter__(cls):
-        """Iterate over static and dynamic members, yielding each member exactly once."""
-        # add() appends dynamic members to _member_names_, which the base iterator already
-        # walks; track what was yielded so _dynamic_values does not produce duplicates.
-        yielded = set()
-        for member in super().__iter__():
-            yielded.add(id(member))
-            yield member
-        # getattr guards against iteration before __new__ has attached _dynamic_values
-        for member in getattr(cls, "_dynamic_values", {}).values():
-            if id(member) not in yielded:
-                yield member
-
-    def add(cls, value):
-        """
-        Add a new API to the enum.
-        Used to register external APIs.
-
-        :param value: The API value; the member name is ``value`` lowercased with "-" replaced by "_".
-        :returns: The newly created member, or the existing member if the derived name is already taken.
-        """
-        member_name = value.lower().replace("-", "_")
-
-        # If this member name already exists in the enum, return it
-        if member_name in cls._member_map_:
-            return cls._member_map_[member_name]
-
-        # Create a new enum member
-        member = object.__new__(cls)
-        member._name_ = member_name
-        member._value_ = value
-
-        # Add it to the enum class
-        cls._member_map_[member_name] = member
-        cls._member_names_.append(member_name)
-        # NOTE(review): this rewrites the class-wide member type on every add() — confirm it is required.
-        cls._member_type_ = str
-
-        # Store it in our dynamic values
-        cls._dynamic_values[value] = member
-
-        return member
-
-
-@json_schema_type
-class Api(Enum, metaclass=DynamicApiMeta):
-    """Enumeration of all available APIs in the Llama Stack system.
-    :cvar providers: Provider management and configuration
-    :cvar inference: Text generation, chat completions, and embeddings
-    :cvar safety: Content moderation and safety shields
-    :cvar agents: Agent orchestration and execution
-    :cvar batches: Batch processing for asynchronous API requests
-    :cvar vector_io: Vector database operations and queries
-    :cvar datasetio: Dataset input/output operations
-    :cvar scoring: Model output evaluation and scoring
-    :cvar eval: Model evaluation and benchmarking framework
-    :cvar post_training: Fine-tuning and model training
-    :cvar tool_runtime: Tool execution and management
-    :cvar models: Model metadata and management
-    :cvar shields: Safety shield implementations
-    :cvar vector_stores: Vector stores (only used for routing table)
-    :cvar datasets: Dataset creation and management
-    :cvar scoring_functions: Scoring function definitions
-    :cvar benchmarks: Benchmark suite management
-    :cvar tool_groups: Tool group organization
-    :cvar files: File storage and management
-    :cvar prompts: Prompt versions and management
-    :cvar conversations: Conversation management
-    :cvar inspect: Built-in system inspection and introspection
-    """
-
-    # Further members can be registered at runtime through Api.add(), provided by the
-    # DynamicApiMeta metaclass.
-    providers = "providers"
-    inference = "inference"
-    safety = "safety"
-    agents = "agents"
-    batches = "batches"
-    vector_io = "vector_io"
-    datasetio = "datasetio"
-    scoring = "scoring"
-    eval = "eval"
-    post_training = "post_training"
-    tool_runtime = "tool_runtime"
-
-    models = "models"
-    shields = "shields"
-    vector_stores = "vector_stores"  # only used for routing table
-    datasets = "datasets"
-    scoring_functions = "scoring_functions"
-    benchmarks = "benchmarks"
-    tool_groups = "tool_groups"
-    files = "files"
-    prompts = "prompts"
-    conversations = "conversations"
-
-    # built-in API
-    inspect = "inspect"
-
-
-@json_schema_type
-class Error(BaseModel):
-    """
-    Error response from the API. Roughly follows RFC 7807.
-
-    :param status: HTTP status code
-    :param title: Error title, a short summary of the error which is invariant for an error type
-    :param detail: Error detail, a longer human-readable description of the error
-    :param instance: (Optional) A URL which can be used to retrieve more information about the specific occurrence of the error
-    """
-
-    # status, title and detail are required; only instance has a default.
-    status: int
-    title: str
-    detail: str
-    instance: str | None = None
-
-
-class ExternalApiSpec(BaseModel):
-    """Specification for an external API implementation."""
-
-    module: str = Field(..., description="Python module containing the API implementation")
-    name: str = Field(..., description="Name of the API")
-    # NOTE(review): default=[] is a literal list default; default_factory=list would be the
-    # more conventional spelling for a container default.
-    pip_packages: list[str] = Field(default=[], description="List of pip packages to install the API")
-    protocol: str = Field(..., description="Name of the protocol class for the API")
--- a/src/llama_stack/apis/eval/init.py
+++ b/src/llama_stack/apis/eval/init.py
@ -1,7 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from .eval import *
--- a/src/llama_stack/apis/eval/eval.py
+++ b/src/llama_stack/apis/eval/eval.py
@ -1,137 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from typing import Any, Literal, Protocol
-
-from pydantic import BaseModel, Field
-
-from llama_stack.apis.common.job_types import Job
-from llama_stack.apis.inference import SamplingParams, SystemMessage
-from llama_stack.apis.scoring import ScoringResult
-from llama_stack.apis.scoring_functions import ScoringFnParams
-from llama_stack.apis.version import LLAMA_STACK_API_V1ALPHA
-from llama_stack.schema_utils import json_schema_type, webmethod
-
-
-@json_schema_type
-class ModelCandidate(BaseModel):
-    """A model candidate for evaluation.
-
-    :param model: The model ID to evaluate.
-    :param sampling_params: The sampling parameters for the model.
-    :param system_message: (Optional) The system message providing instructions or context to the model.
-    """
-
-    type: Literal["model"] = "model"
-    model: str
-    sampling_params: SamplingParams
-    system_message: SystemMessage | None = None
-
-
-EvalCandidate = ModelCandidate
-
-
-@json_schema_type
-class BenchmarkConfig(BaseModel):
-    """A benchmark configuration for evaluation.
-
-    :param eval_candidate: The candidate to evaluate.
-    :param scoring_params: Map between scoring function id and parameters for each scoring function you want to run
-    :param num_examples: (Optional) The number of examples to evaluate. If not provided, all examples in the dataset will be evaluated
-    """
-
-    eval_candidate: EvalCandidate
-    scoring_params: dict[str, ScoringFnParams] = Field(
-        description="Map between scoring function id and parameters for each scoring function you want to run",
-        default_factory=dict,
-    )
-    num_examples: int | None = Field(
-        description="Number of examples to evaluate (useful for testing), if not provided, all examples in the dataset will be evaluated",
-        default=None,
-    )
-    # we could optinally add any specific dataset config here
-
-
-@json_schema_type
-class EvaluateResponse(BaseModel):
-    """The response from an evaluation.
-
-    :param generations: The generations from the evaluation.
-    :param scores: The scores from the evaluation.
-    """
-
-    generations: list[dict[str, Any]]
-    # each key in the dict is a scoring function name
-    scores: dict[str, ScoringResult]
-
-
-class Eval(Protocol):
-    """Evaluations
-
-    Llama Stack Evaluation API for running evaluations on model and agent candidates."""
-
-    @webmethod(route="/eval/benchmarks/{benchmark_id}/jobs", method="POST", level=LLAMA_STACK_API_V1ALPHA)
-    async def run_eval(
-        self,
-        benchmark_id: str,
-        benchmark_config: BenchmarkConfig,
-    ) -> Job:
-        """Run an evaluation on a benchmark.
-
-        :param benchmark_id: The ID of the benchmark to run the evaluation on.
-        :param benchmark_config: The configuration for the benchmark.
-        :returns: The job that was created to run the evaluation.
-        """
-        ...
-
-    @webmethod(route="/eval/benchmarks/{benchmark_id}/evaluations", method="POST", level=LLAMA_STACK_API_V1ALPHA)
-    async def evaluate_rows(
-        self,
-        benchmark_id: str,
-        input_rows: list[dict[str, Any]],
-        scoring_functions: list[str],
-        benchmark_config: BenchmarkConfig,
-    ) -> EvaluateResponse:
-        """Evaluate a list of rows on a benchmark.
-
-        :param benchmark_id: The ID of the benchmark to run the evaluation on.
-        :param input_rows: The rows to evaluate.
-        :param scoring_functions: The scoring functions to use for the evaluation.
-        :param benchmark_config: The configuration for the benchmark.
-        :returns: EvaluateResponse object containing generations and scores.
-        """
-        ...
-
-    @webmethod(route="/eval/benchmarks/{benchmark_id}/jobs/{job_id}", method="GET", level=LLAMA_STACK_API_V1ALPHA)
-    async def job_status(self, benchmark_id: str, job_id: str) -> Job:
-        """Get the status of a job.
-
-        :param benchmark_id: The ID of the benchmark to run the evaluation on.
-        :param job_id: The ID of the job to get the status of.
-        :returns: The status of the evaluation job.
-        """
-        ...
-
-    @webmethod(route="/eval/benchmarks/{benchmark_id}/jobs/{job_id}", method="DELETE", level=LLAMA_STACK_API_V1ALPHA)
-    async def job_cancel(self, benchmark_id: str, job_id: str) -> None:
-        """Cancel a job.
-
-        :param benchmark_id: The ID of the benchmark to run the evaluation on.
-        :param job_id: The ID of the job to cancel.
-        """
-        ...
-
-    @webmethod(
-        route="/eval/benchmarks/{benchmark_id}/jobs/{job_id}/result", method="GET", level=LLAMA_STACK_API_V1ALPHA
-    )
-    async def job_result(self, benchmark_id: str, job_id: str) -> EvaluateResponse:
-        """Get the result of a job.
-
-        :param benchmark_id: The ID of the benchmark to run the evaluation on.
-        :param job_id: The ID of the job to get the result of.
-        :returns: The result of the job.
-        """
-        ...
--- a/src/llama_stack/apis/files/init.py
+++ b/src/llama_stack/apis/files/init.py
@ -1,7 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from .files import *
--- a/src/llama_stack/apis/files/files.py
+++ b/src/llama_stack/apis/files/files.py
@ -1,194 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from enum import StrEnum
-from typing import Annotated, ClassVar, Literal, Protocol, runtime_checkable
-
-from fastapi import File, Form, Response, UploadFile
-from pydantic import BaseModel, Field
-
-from llama_stack.apis.common.responses import Order
-from llama_stack.apis.common.tracing import telemetry_traceable
-from llama_stack.apis.version import LLAMA_STACK_API_V1
-from llama_stack.schema_utils import json_schema_type, webmethod
-
-
-# OpenAI Files API Models
-class OpenAIFilePurpose(StrEnum):
-    """
-    Valid purpose values for OpenAI Files API.
-    """
-
-    ASSISTANTS = "assistants"
-    BATCH = "batch"
-    # TODO: Add other purposes as needed
-
-
-@json_schema_type
-class OpenAIFileObject(BaseModel):
-    """
-    OpenAI File object as defined in the OpenAI Files API.
-
-    :param object: The object type, which is always "file"
-    :param id: The file identifier, which can be referenced in the API endpoints
-    :param bytes: The size of the file, in bytes
-    :param created_at: The Unix timestamp (in seconds) for when the file was created
-    :param expires_at: The Unix timestamp (in seconds) for when the file expires
-    :param filename: The name of the file
-    :param purpose: The intended purpose of the file
-    """
-
-    object: Literal["file"] = "file"
-    id: str
-    bytes: int
-    created_at: int
-    expires_at: int
-    filename: str
-    purpose: OpenAIFilePurpose
-
-
-@json_schema_type
-class ExpiresAfter(BaseModel):
-    """
-    Control expiration of uploaded files.
-
-    Params:
-     - anchor, must be "created_at"
-     - seconds, must be int between 3600 and 2592000 (1 hour to 30 days)
-    """
-
-    MIN: ClassVar[int] = 3600  # 1 hour
-    MAX: ClassVar[int] = 2592000  # 30 days
-
-    anchor: Literal["created_at"]
-    seconds: int = Field(..., ge=3600, le=2592000)
-
-
-@json_schema_type
-class ListOpenAIFileResponse(BaseModel):
-    """
-    Response for listing files in OpenAI Files API.
-
-    :param data: List of file objects
-    :param has_more: Whether there are more files available beyond this page
-    :param first_id: ID of the first file in the list for pagination
-    :param last_id: ID of the last file in the list for pagination
-    :param object: The object type, which is always "list"
-    """
-
-    data: list[OpenAIFileObject]
-    has_more: bool
-    first_id: str
-    last_id: str
-    object: Literal["list"] = "list"
-
-
-@json_schema_type
-class OpenAIFileDeleteResponse(BaseModel):
-    """
-    Response for deleting a file in OpenAI Files API.
-
-    :param id: The file identifier that was deleted
-    :param object: The object type, which is always "file"
-    :param deleted: Whether the file was successfully deleted
-    """
-
-    id: str
-    object: Literal["file"] = "file"
-    deleted: bool
-
-
-@runtime_checkable
-@telemetry_traceable
-class Files(Protocol):
-    """Files
-
-    This API is used to upload documents that can be used with other Llama Stack APIs.
-    """
-
-    # OpenAI Files API Endpoints
-    @webmethod(route="/files", method="POST", level=LLAMA_STACK_API_V1)
-    async def openai_upload_file(
-        self,
-        file: Annotated[UploadFile, File()],
-        purpose: Annotated[OpenAIFilePurpose, Form()],
-        expires_after: Annotated[ExpiresAfter | None, Form()] = None,
-    ) -> OpenAIFileObject:
-        """Upload file.
-
-        Upload a file that can be used across various endpoints.
-
-        The file upload should be a multipart form request with:
-        - file: The File object (not file name) to be uploaded.
-        - purpose: The intended purpose of the uploaded file.
-        - expires_after: Optional form values describing expiration for the file.
-
-        :param file: The uploaded file object containing content and metadata (filename, content_type, etc.).
-        :param purpose: The intended purpose of the uploaded file (e.g., "assistants", "fine-tune").
-        :param expires_after: Optional form values describing expiration for the file.
-        :returns: An OpenAIFileObject representing the uploaded file.
-        """
-        ...
-
-    @webmethod(route="/files", method="GET", level=LLAMA_STACK_API_V1)
-    async def openai_list_files(
-        self,
-        after: str | None = None,
-        limit: int | None = 10000,
-        order: Order | None = Order.desc,
-        purpose: OpenAIFilePurpose | None = None,
-    ) -> ListOpenAIFileResponse:
-        """List files.
-
-        Returns a list of files that belong to the user's organization.
-
-        :param after: A cursor for use in pagination. `after` is an object ID that defines your place in the list. For instance, if you make a list request and receive 100 objects, ending with obj_foo, your subsequent call can include after=obj_foo in order to fetch the next page of the list.
-        :param limit: A limit on the number of objects to be returned. Limit can range between 1 and 10,000, and the default is 10,000.
-        :param order: Sort order by the `created_at` timestamp of the objects. `asc` for ascending order and `desc` for descending order.
-        :param purpose: Only return files with the given purpose.
-        :returns: An ListOpenAIFileResponse containing the list of files.
-        """
-        ...
-
-    @webmethod(route="/files/{file_id}", method="GET", level=LLAMA_STACK_API_V1)
-    async def openai_retrieve_file(
-        self,
-        file_id: str,
-    ) -> OpenAIFileObject:
-        """Retrieve file.
-
-        Returns information about a specific file.
-
-        :param file_id: The ID of the file to use for this request.
-        :returns: An OpenAIFileObject containing file information.
-        """
-        ...
-
-    @webmethod(route="/files/{file_id}", method="DELETE", level=LLAMA_STACK_API_V1)
-    async def openai_delete_file(
-        self,
-        file_id: str,
-    ) -> OpenAIFileDeleteResponse:
-        """Delete file.
-
-        :param file_id: The ID of the file to use for this request.
-        :returns: An OpenAIFileDeleteResponse indicating successful deletion.
-        """
-        ...
-
-    @webmethod(route="/files/{file_id}/content", method="GET", level=LLAMA_STACK_API_V1)
-    async def openai_retrieve_file_content(
-        self,
-        file_id: str,
-    ) -> Response:
-        """Retrieve file content.
-
-        Returns the contents of the specified file.
-
-        :param file_id: The ID of the file to use for this request.
-        :returns: The raw file content as a binary response.
-        """
-        ...
--- a/src/llama_stack/apis/inference/init.py
+++ b/src/llama_stack/apis/inference/init.py
@ -1,7 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from .inference import *
--- a/src/llama_stack/apis/inference/inference.py
+++ b/src/llama_stack/apis/inference/inference.py
--- a/src/llama_stack/apis/inspect/init.py
+++ b/src/llama_stack/apis/inspect/init.py
@ -1,7 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from .inspect import *
--- a/src/llama_stack/apis/inspect/inspect.py
+++ b/src/llama_stack/apis/inspect/inspect.py
@ -1,102 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from typing import Literal, Protocol, runtime_checkable
-
-from pydantic import BaseModel
-
-from llama_stack.apis.version import (
-    LLAMA_STACK_API_V1,
-)
-from llama_stack.providers.datatypes import HealthStatus
-from llama_stack.schema_utils import json_schema_type, webmethod
-
-# Valid values for the route filter parameter.
-# Actual API levels: v1, v1alpha, v1beta (filters by level, excludes deprecated)
-# Special filter value: "deprecated" (shows deprecated routes regardless of level)
-ApiFilter = Literal["v1", "v1alpha", "v1beta", "deprecated"]
-
-
-@json_schema_type
-class RouteInfo(BaseModel):
-    """Information about an API route including its path, method, and implementing providers.
-
-    :param route: The API endpoint path
-    :param method: HTTP method for the route
-    :param provider_types: List of provider types that implement this route
-    """
-
-    route: str
-    method: str
-    provider_types: list[str]
-
-
-@json_schema_type
-class HealthInfo(BaseModel):
-    """Health status information for the service.
-
-    :param status: Current health status of the service
-    """
-
-    status: HealthStatus
-
-
-@json_schema_type
-class VersionInfo(BaseModel):
-    """Version information for the service.
-
-    :param version: Version number of the service
-    """
-
-    version: str
-
-
-class ListRoutesResponse(BaseModel):
-    """Response containing a list of all available API routes.
-
-    :param data: List of available route information objects
-    """
-
-    data: list[RouteInfo]
-
-
-@runtime_checkable
-class Inspect(Protocol):
-    """Inspect
-
-    APIs for inspecting the Llama Stack service, including health status, available API routes with methods and implementing providers.
-    """
-
-    @webmethod(route="/inspect/routes", method="GET", level=LLAMA_STACK_API_V1)
-    async def list_routes(self, api_filter: ApiFilter | None = None) -> ListRoutesResponse:
-        """List routes.
-
-        List all available API routes with their methods and implementing providers.
-
-        :param api_filter: Optional filter to control which routes are returned. Can be an API level ('v1', 'v1alpha', 'v1beta') to show non-deprecated routes at that level, or 'deprecated' to show deprecated routes across all levels. If not specified, returns all non-deprecated routes.
-        :returns: Response containing information about all available routes.
-        """
-        ...
-
-    @webmethod(route="/health", method="GET", level=LLAMA_STACK_API_V1, require_authentication=False)
-    async def health(self) -> HealthInfo:
-        """Get health status.
-
-        Get the current health status of the service.
-
-        :returns: Health information indicating if the service is operational.
-        """
-        ...
-
-    @webmethod(route="/version", method="GET", level=LLAMA_STACK_API_V1, require_authentication=False)
-    async def version(self) -> VersionInfo:
-        """Get version.
-
-        Get the version of the service.
-
-        :returns: Version information containing the service version number.
-        """
-        ...
--- a/src/llama_stack/apis/models/init.py
+++ b/src/llama_stack/apis/models/init.py
@ -1,7 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from .models import *
--- a/src/llama_stack/apis/models/models.py
+++ b/src/llama_stack/apis/models/models.py
@ -1,172 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from enum import StrEnum
-from typing import Any, Literal, Protocol, runtime_checkable
-
-from pydantic import BaseModel, ConfigDict, Field, field_validator
-
-from llama_stack.apis.common.tracing import telemetry_traceable
-from llama_stack.apis.resource import Resource, ResourceType
-from llama_stack.apis.version import LLAMA_STACK_API_V1
-from llama_stack.schema_utils import json_schema_type, webmethod
-
-
-class CommonModelFields(BaseModel):
-    metadata: dict[str, Any] = Field(
-        default_factory=dict,
-        description="Any additional metadata for this model",
-    )
-
-
-@json_schema_type
-class ModelType(StrEnum):
-    """Enumeration of supported model types in Llama Stack.
-    :cvar llm: Large language model for text generation and completion
-    :cvar embedding: Embedding model for converting text to vector representations
-    :cvar rerank: Reranking model for reordering documents based on their relevance to a query
-    """
-
-    llm = "llm"
-    embedding = "embedding"
-    rerank = "rerank"
-
-
-@json_schema_type
-class Model(CommonModelFields, Resource):
-    """A model resource representing an AI model registered in Llama Stack.
-
-    :param type: The resource type, always 'model' for model resources
-    :param model_type: The type of model (LLM or embedding model)
-    :param metadata: Any additional metadata for this model
-    :param identifier: Unique identifier for this resource in llama stack
-    :param provider_resource_id: Unique identifier for this resource in the provider
-    :param provider_id: ID of the provider that owns this resource
-    """
-
-    type: Literal[ResourceType.model] = ResourceType.model
-
-    @property
-    def model_id(self) -> str:
-        return self.identifier
-
-    @property
-    def provider_model_id(self) -> str:
-        assert self.provider_resource_id is not None, "Provider resource ID must be set"
-        return self.provider_resource_id
-
-    model_config = ConfigDict(protected_namespaces=())
-
-    model_type: ModelType = Field(default=ModelType.llm)
-
-    @field_validator("provider_resource_id")
-    @classmethod
-    def validate_provider_resource_id(cls, v):
-        if v is None:
-            raise ValueError("provider_resource_id cannot be None")
-        return v
-
-
-class ModelInput(CommonModelFields):
-    model_id: str
-    provider_id: str | None = None
-    provider_model_id: str | None = None
-    model_type: ModelType | None = ModelType.llm
-    model_config = ConfigDict(protected_namespaces=())
-
-
-class ListModelsResponse(BaseModel):
-    data: list[Model]
-
-
-@json_schema_type
-class OpenAIModel(BaseModel):
-    """A model from OpenAI.
-
-    :id: The ID of the model
-    :object: The object type, which will be "model"
-    :created: The Unix timestamp in seconds when the model was created
-    :owned_by: The owner of the model
-    :custom_metadata: Llama Stack-specific metadata including model_type, provider info, and additional metadata
-    """
-
-    id: str
-    object: Literal["model"] = "model"
-    created: int
-    owned_by: str
-    custom_metadata: dict[str, Any] | None = None
-
-
-class OpenAIListModelsResponse(BaseModel):
-    data: list[OpenAIModel]
-
-
-@runtime_checkable
-@telemetry_traceable
-class Models(Protocol):
-    async def list_models(self) -> ListModelsResponse:
-        """List all models.
-
-        :returns: A ListModelsResponse.
-        """
-        ...
-
-    @webmethod(route="/models", method="GET", level=LLAMA_STACK_API_V1)
-    async def openai_list_models(self) -> OpenAIListModelsResponse:
-        """List models using the OpenAI API.
-
-        :returns: A OpenAIListModelsResponse.
-        """
-        ...
-
-    @webmethod(route="/models/{model_id:path}", method="GET", level=LLAMA_STACK_API_V1)
-    async def get_model(
-        self,
-        model_id: str,
-    ) -> Model:
-        """Get model.
-
-        Get a model by its identifier.
-
-        :param model_id: The identifier of the model to get.
-        :returns: A Model.
-        """
-        ...
-
-    @webmethod(route="/models", method="POST", level=LLAMA_STACK_API_V1, deprecated=True)
-    async def register_model(
-        self,
-        model_id: str,
-        provider_model_id: str | None = None,
-        provider_id: str | None = None,
-        metadata: dict[str, Any] | None = None,
-        model_type: ModelType | None = None,
-    ) -> Model:
-        """Register model.
-
-        Register a model.
-
-        :param model_id: The identifier of the model to register.
-        :param provider_model_id: The identifier of the model in the provider.
-        :param provider_id: The identifier of the provider.
-        :param metadata: Any additional metadata for this model.
-        :param model_type: The type of model to register.
-        :returns: A Model.
-        """
-        ...
-
-    @webmethod(route="/models/{model_id:path}", method="DELETE", level=LLAMA_STACK_API_V1, deprecated=True)
-    async def unregister_model(
-        self,
-        model_id: str,
-    ) -> None:
-        """Unregister model.
-
-        Unregister a model.
-
-        :param model_id: The identifier of the model to unregister.
-        """
-        ...
--- a/src/llama_stack/apis/post_training/init.py
+++ b/src/llama_stack/apis/post_training/init.py
@ -1,7 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from .post_training import *
--- a/src/llama_stack/apis/post_training/post_training.py
+++ b/src/llama_stack/apis/post_training/post_training.py
@ -1,368 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from datetime import datetime
-from enum import Enum
-from typing import Annotated, Any, Literal, Protocol
-
-from pydantic import BaseModel, Field
-
-from llama_stack.apis.common.content_types import URL
-from llama_stack.apis.common.job_types import JobStatus
-from llama_stack.apis.common.training_types import Checkpoint
-from llama_stack.apis.version import LLAMA_STACK_API_V1ALPHA
-from llama_stack.schema_utils import json_schema_type, register_schema, webmethod
-
-
-@json_schema_type
-class OptimizerType(Enum):
-    """Available optimizer algorithms for training.
-    :cvar adam: Adaptive Moment Estimation optimizer
-    :cvar adamw: AdamW optimizer with weight decay
-    :cvar sgd: Stochastic Gradient Descent optimizer
-    """
-
-    adam = "adam"
-    adamw = "adamw"
-    sgd = "sgd"
-
-
-@json_schema_type
-class DatasetFormat(Enum):
-    """Format of the training dataset.
-    :cvar instruct: Instruction-following format with prompt and completion
-    :cvar dialog: Multi-turn conversation format with messages
-    """
-
-    instruct = "instruct"
-    dialog = "dialog"
-
-
-@json_schema_type
-class DataConfig(BaseModel):
-    """Configuration for training data and data loading.
-
-    :param dataset_id: Unique identifier for the training dataset
-    :param batch_size: Number of samples per training batch
-    :param shuffle: Whether to shuffle the dataset during training
-    :param data_format: Format of the dataset (instruct or dialog)
-    :param validation_dataset_id: (Optional) Unique identifier for the validation dataset
-    :param packed: (Optional) Whether to pack multiple samples into a single sequence for efficiency
-    :param train_on_input: (Optional) Whether to compute loss on input tokens as well as output tokens
-    """
-
-    dataset_id: str
-    batch_size: int
-    shuffle: bool
-    data_format: DatasetFormat
-    validation_dataset_id: str | None = None
-    packed: bool | None = False
-    train_on_input: bool | None = False
-
-
-@json_schema_type
-class OptimizerConfig(BaseModel):
-    """Configuration parameters for the optimization algorithm.
-
-    :param optimizer_type: Type of optimizer to use (adam, adamw, or sgd)
-    :param lr: Learning rate for the optimizer
-    :param weight_decay: Weight decay coefficient for regularization
-    :param num_warmup_steps: Number of steps for learning rate warmup
-    """
-
-    optimizer_type: OptimizerType
-    lr: float
-    weight_decay: float
-    num_warmup_steps: int
-
-
-@json_schema_type
-class EfficiencyConfig(BaseModel):
-    """Configuration for memory and compute efficiency optimizations.
-
-    :param enable_activation_checkpointing: (Optional) Whether to use activation checkpointing to reduce memory usage
-    :param enable_activation_offloading: (Optional) Whether to offload activations to CPU to save GPU memory
-    :param memory_efficient_fsdp_wrap: (Optional) Whether to use memory-efficient FSDP wrapping
-    :param fsdp_cpu_offload: (Optional) Whether to offload FSDP parameters to CPU
-    """
-
-    enable_activation_checkpointing: bool | None = False
-    enable_activation_offloading: bool | None = False
-    memory_efficient_fsdp_wrap: bool | None = False
-    fsdp_cpu_offload: bool | None = False
-
-
-@json_schema_type
-class TrainingConfig(BaseModel):
-    """Comprehensive configuration for the training process.
-
-    :param n_epochs: Number of training epochs to run
-    :param max_steps_per_epoch: Maximum number of steps to run per epoch
-    :param gradient_accumulation_steps: Number of steps to accumulate gradients before updating
-    :param max_validation_steps: (Optional) Maximum number of validation steps per epoch
-    :param data_config: (Optional) Configuration for data loading and formatting
-    :param optimizer_config: (Optional) Configuration for the optimization algorithm
-    :param efficiency_config: (Optional) Configuration for memory and compute optimizations
-    :param dtype: (Optional) Data type for model parameters (bf16, fp16, fp32)
-    """
-
-    n_epochs: int
-    max_steps_per_epoch: int = 1
-    gradient_accumulation_steps: int = 1
-    max_validation_steps: int | None = 1
-    data_config: DataConfig | None = None
-    optimizer_config: OptimizerConfig | None = None
-    efficiency_config: EfficiencyConfig | None = None
-    dtype: str | None = "bf16"
-
-
-@json_schema_type
-class LoraFinetuningConfig(BaseModel):
-    """Configuration for Low-Rank Adaptation (LoRA) fine-tuning.
-
-    :param type: Algorithm type identifier, always "LoRA"
-    :param lora_attn_modules: List of attention module names to apply LoRA to
-    :param apply_lora_to_mlp: Whether to apply LoRA to MLP layers
-    :param apply_lora_to_output: Whether to apply LoRA to output projection layers
-    :param rank: Rank of the LoRA adaptation (lower rank = fewer parameters)
-    :param alpha: LoRA scaling parameter that controls adaptation strength
-    :param use_dora: (Optional) Whether to use DoRA (Weight-Decomposed Low-Rank Adaptation)
-    :param quantize_base: (Optional) Whether to quantize the base model weights
-    """
-
-    type: Literal["LoRA"] = "LoRA"
-    lora_attn_modules: list[str]
-    apply_lora_to_mlp: bool
-    apply_lora_to_output: bool
-    rank: int
-    alpha: int
-    use_dora: bool | None = False
-    quantize_base: bool | None = False
-
-
-@json_schema_type
-class QATFinetuningConfig(BaseModel):
-    """Configuration for Quantization-Aware Training (QAT) fine-tuning.
-
-    :param type: Algorithm type identifier, always "QAT"
-    :param quantizer_name: Name of the quantization algorithm to use
-    :param group_size: Size of groups for grouped quantization
-    """
-
-    type: Literal["QAT"] = "QAT"
-    quantizer_name: str
-    group_size: int
-
-
-AlgorithmConfig = Annotated[LoraFinetuningConfig | QATFinetuningConfig, Field(discriminator="type")]
-register_schema(AlgorithmConfig, name="AlgorithmConfig")
-
-
-@json_schema_type
-class PostTrainingJobLogStream(BaseModel):
-    """Stream of logs from a finetuning job.
-
-    :param job_uuid: Unique identifier for the training job
-    :param log_lines: List of log message strings from the training process
-    """
-
-    job_uuid: str
-    log_lines: list[str]
-
-
-@json_schema_type
-class RLHFAlgorithm(Enum):
-    """Available reinforcement learning from human feedback algorithms.
-    :cvar dpo: Direct Preference Optimization algorithm
-    """
-
-    dpo = "dpo"
-
-
-@json_schema_type
-class DPOLossType(Enum):
-    sigmoid = "sigmoid"
-    hinge = "hinge"
-    ipo = "ipo"
-    kto_pair = "kto_pair"
-
-
-@json_schema_type
-class DPOAlignmentConfig(BaseModel):
-    """Configuration for Direct Preference Optimization (DPO) alignment.
-
-    :param beta: Temperature parameter for the DPO loss
-    :param loss_type: The type of loss function to use for DPO
-    """
-
-    beta: float
-    loss_type: DPOLossType = DPOLossType.sigmoid
-
-
-@json_schema_type
-class PostTrainingRLHFRequest(BaseModel):
-    """Request to finetune a model using reinforcement learning from human feedback.
-
-    :param job_uuid: Unique identifier for the training job
-    :param finetuned_model: URL or path to the base model to fine-tune
-    :param dataset_id: Unique identifier for the training dataset
-    :param validation_dataset_id: Unique identifier for the validation dataset
-    :param algorithm: RLHF algorithm to use for training
-    :param algorithm_config: Configuration parameters for the RLHF algorithm
-    :param optimizer_config: Configuration parameters for the optimization algorithm
-    :param training_config: Configuration parameters for the training process
-    :param hyperparam_search_config: Configuration for hyperparameter search
-    :param logger_config: Configuration for training logging
-    """
-
-    job_uuid: str
-
-    finetuned_model: URL
-
-    dataset_id: str
-    validation_dataset_id: str
-
-    algorithm: RLHFAlgorithm
-    algorithm_config: DPOAlignmentConfig
-
-    optimizer_config: OptimizerConfig
-    training_config: TrainingConfig
-
-    # TODO: define these
-    hyperparam_search_config: dict[str, Any]
-    logger_config: dict[str, Any]
-
-
-class PostTrainingJob(BaseModel):
-    job_uuid: str
-
-
-@json_schema_type
-class PostTrainingJobStatusResponse(BaseModel):
-    """Status of a finetuning job.
-
-    :param job_uuid: Unique identifier for the training job
-    :param status: Current status of the training job
-    :param scheduled_at: (Optional) Timestamp when the job was scheduled
-    :param started_at: (Optional) Timestamp when the job execution began
-    :param completed_at: (Optional) Timestamp when the job finished, if completed
-    :param resources_allocated: (Optional) Information about computational resources allocated to the job
-    :param checkpoints: List of model checkpoints created during training
-    """
-
-    job_uuid: str
-    status: JobStatus
-
-    scheduled_at: datetime | None = None
-    started_at: datetime | None = None
-    completed_at: datetime | None = None
-
-    resources_allocated: dict[str, Any] | None = None
-
-    checkpoints: list[Checkpoint] = Field(default_factory=list)
-
-
-class ListPostTrainingJobsResponse(BaseModel):
-    data: list[PostTrainingJob]
-
-
-@json_schema_type
-class PostTrainingJobArtifactsResponse(BaseModel):
-    """Artifacts of a finetuning job.
-
-    :param job_uuid: Unique identifier for the training job
-    :param checkpoints: List of model checkpoints created during training
-    """
-
-    job_uuid: str
-    checkpoints: list[Checkpoint] = Field(default_factory=list)
-
-    # TODO(ashwin): metrics, evals
-
-
-class PostTraining(Protocol):
-    @webmethod(route="/post-training/supervised-fine-tune", method="POST", level=LLAMA_STACK_API_V1ALPHA)
-    async def supervised_fine_tune(
-        self,
-        job_uuid: str,
-        training_config: TrainingConfig,
-        hyperparam_search_config: dict[str, Any],
-        logger_config: dict[str, Any],
-        model: str | None = Field(
-            default=None,
-            description="Model descriptor for training if not in provider config`",
-        ),
-        checkpoint_dir: str | None = None,
-        algorithm_config: AlgorithmConfig | None = None,
-    ) -> PostTrainingJob:
-        """Run supervised fine-tuning of a model.
-
-        :param job_uuid: The UUID of the job to create.
-        :param training_config: The training configuration.
-        :param hyperparam_search_config: The hyperparam search configuration.
-        :param logger_config: The logger configuration.
-        :param model: The model to fine-tune.
-        :param checkpoint_dir: The directory to save checkpoint(s) to.
-        :param algorithm_config: The algorithm configuration.
-        :returns: A PostTrainingJob.
-        """
-        ...
-
-    @webmethod(route="/post-training/preference-optimize", method="POST", level=LLAMA_STACK_API_V1ALPHA)
-    async def preference_optimize(
-        self,
-        job_uuid: str,
-        finetuned_model: str,
-        algorithm_config: DPOAlignmentConfig,
-        training_config: TrainingConfig,
-        hyperparam_search_config: dict[str, Any],
-        logger_config: dict[str, Any],
-    ) -> PostTrainingJob:
-        """Run preference optimization of a model.
-
-        :param job_uuid: The UUID of the job to create.
-        :param finetuned_model: The model to fine-tune.
-        :param algorithm_config: The algorithm configuration.
-        :param training_config: The training configuration.
-        :param hyperparam_search_config: The hyperparam search configuration.
-        :param logger_config: The logger configuration.
-        :returns: A PostTrainingJob.
-        """
-        ...
-
-    @webmethod(route="/post-training/jobs", method="GET", level=LLAMA_STACK_API_V1ALPHA)
-    async def get_training_jobs(self) -> ListPostTrainingJobsResponse:
-        """Get all training jobs.
-
-        :returns: A ListPostTrainingJobsResponse.
-        """
-        ...
-
-    @webmethod(route="/post-training/job/status", method="GET", level=LLAMA_STACK_API_V1ALPHA)
-    async def get_training_job_status(self, job_uuid: str) -> PostTrainingJobStatusResponse:
-        """Get the status of a training job.
-
-        :param job_uuid: The UUID of the job to get the status of.
-        :returns: A PostTrainingJobStatusResponse.
-        """
-        ...
-
-    @webmethod(route="/post-training/job/cancel", method="POST", level=LLAMA_STACK_API_V1ALPHA)
-    async def cancel_training_job(self, job_uuid: str) -> None:
-        """Cancel a training job.
-
-        :param job_uuid: The UUID of the job to cancel.
-        """
-        ...
-
-    @webmethod(route="/post-training/job/artifacts", method="GET", level=LLAMA_STACK_API_V1ALPHA)
-    async def get_training_job_artifacts(self, job_uuid: str) -> PostTrainingJobArtifactsResponse:
-        """Get the artifacts of a training job.
-
-        :param job_uuid: The UUID of the job to get the artifacts of.
-        :returns: A PostTrainingJobArtifactsResponse.
-        """
-        ...
--- a/src/llama_stack/apis/prompts/init.py
+++ b/src/llama_stack/apis/prompts/init.py
@ -1,9 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from .prompts import ListPromptsResponse, Prompt, Prompts
-
-__all__ = ["Prompt", "Prompts", "ListPromptsResponse"]
--- a/src/llama_stack/apis/prompts/prompts.py
+++ b/src/llama_stack/apis/prompts/prompts.py
@ -1,204 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-import re
-import secrets
-from typing import Protocol, runtime_checkable
-
-from pydantic import BaseModel, Field, field_validator, model_validator
-
-from llama_stack.apis.common.tracing import telemetry_traceable
-from llama_stack.apis.version import LLAMA_STACK_API_V1
-from llama_stack.schema_utils import json_schema_type, webmethod
-
-
-@json_schema_type
-class Prompt(BaseModel):
-    """A prompt resource representing a stored OpenAI Compatible prompt template in Llama Stack.
-
-    :param prompt: The system prompt text with variable placeholders. Variables are only supported when using the Responses API.
-    :param version: Version (integer starting at 1, incremented on save)
-    :param prompt_id: Unique identifier formatted as 'pmpt_<48-digit-hash>'
-    :param variables: List of prompt variable names that can be used in the prompt template
-    :param is_default: Boolean indicating whether this version is the default version for this prompt
-    """
-
-    prompt: str | None = Field(default=None, description="The system prompt with variable placeholders")
-    version: int = Field(description="Version (integer starting at 1, incremented on save)", ge=1)
-    prompt_id: str = Field(description="Unique identifier in format 'pmpt_<48-digit-hash>'")
-    variables: list[str] = Field(
-        default_factory=list, description="List of variable names that can be used in the prompt template"
-    )
-    is_default: bool = Field(
-        default=False, description="Boolean indicating whether this version is the default version"
-    )
-
-    @field_validator("prompt_id")
-    @classmethod
-    def validate_prompt_id(cls, prompt_id: str) -> str:
-        if not isinstance(prompt_id, str):
-            raise TypeError("prompt_id must be a string in format 'pmpt_<48-digit-hash>'")
-
-        if not prompt_id.startswith("pmpt_"):
-            raise ValueError("prompt_id must start with 'pmpt_' prefix")
-
-        hex_part = prompt_id[5:]
-        if len(hex_part) != 48:
-            raise ValueError("prompt_id must be in format 'pmpt_<48-digit-hash>' (48 lowercase hex chars)")
-
-        for char in hex_part:
-            if char not in "0123456789abcdef":
-                raise ValueError("prompt_id hex part must contain only lowercase hex characters [0-9a-f]")
-
-        return prompt_id
-
-    @field_validator("version")
-    @classmethod
-    def validate_version(cls, prompt_version: int) -> int:
-        if prompt_version < 1:
-            raise ValueError("version must be >= 1")
-        return prompt_version
-
-    @model_validator(mode="after")
-    def validate_prompt_variables(self):
-        """Validate that all variables used in the prompt are declared in the variables list."""
-        if not self.prompt:
-            return self
-
-        prompt_variables = set(re.findall(r"{{\s*(\w+)\s*}}", self.prompt))
-        declared_variables = set(self.variables)
-
-        undeclared = prompt_variables - declared_variables
-        if undeclared:
-            raise ValueError(f"Prompt contains undeclared variables: {sorted(undeclared)}")
-
-        return self
-
-    @classmethod
-    def generate_prompt_id(cls) -> str:
-        # Generate 48 hex characters (24 bytes)
-        random_bytes = secrets.token_bytes(24)
-        hex_string = random_bytes.hex()
-        return f"pmpt_{hex_string}"
-
-
-class ListPromptsResponse(BaseModel):
-    """Response model to list prompts."""
-
-    data: list[Prompt]
-
-
-@runtime_checkable
-@telemetry_traceable
-class Prompts(Protocol):
-    """Prompts
-
-    Protocol for prompt management operations."""
-
-    @webmethod(route="/prompts", method="GET", level=LLAMA_STACK_API_V1)
-    async def list_prompts(self) -> ListPromptsResponse:
-        """List all prompts.
-
-        :returns: A ListPromptsResponse containing all prompts.
-        """
-        ...
-
-    @webmethod(route="/prompts/{prompt_id}/versions", method="GET", level=LLAMA_STACK_API_V1)
-    async def list_prompt_versions(
-        self,
-        prompt_id: str,
-    ) -> ListPromptsResponse:
-        """List prompt versions.
-
-        List all versions of a specific prompt.
-
-        :param prompt_id: The identifier of the prompt to list versions for.
-        :returns: A ListPromptsResponse containing all versions of the prompt.
-        """
-        ...
-
-    @webmethod(route="/prompts/{prompt_id}", method="GET", level=LLAMA_STACK_API_V1)
-    async def get_prompt(
-        self,
-        prompt_id: str,
-        version: int | None = None,
-    ) -> Prompt:
-        """Get prompt.
-
-        Get a prompt by its identifier and optional version.
-
-        :param prompt_id: The identifier of the prompt to get.
-        :param version: The version of the prompt to get (defaults to latest).
-        :returns: A Prompt resource.
-        """
-        ...
-
-    @webmethod(route="/prompts", method="POST", level=LLAMA_STACK_API_V1)
-    async def create_prompt(
-        self,
-        prompt: str,
-        variables: list[str] | None = None,
-    ) -> Prompt:
-        """Create prompt.
-
-        Create a new prompt.
-
-        :param prompt: The prompt text content with variable placeholders.
-        :param variables: List of variable names that can be used in the prompt template.
-        :returns: The created Prompt resource.
-        """
-        ...
-
-    @webmethod(route="/prompts/{prompt_id}", method="PUT", level=LLAMA_STACK_API_V1)
-    async def update_prompt(
-        self,
-        prompt_id: str,
-        prompt: str,
-        version: int,
-        variables: list[str] | None = None,
-        set_as_default: bool = True,
-    ) -> Prompt:
-        """Update prompt.
-
-        Update an existing prompt (increments version).
-
-        :param prompt_id: The identifier of the prompt to update.
-        :param prompt: The updated prompt text content.
-        :param version: The current version of the prompt being updated.
-        :param variables: Updated list of variable names that can be used in the prompt template.
-        :param set_as_default: Set the new version as the default (default=True).
-        :returns: The updated Prompt resource with incremented version.
-        """
-        ...
-
-    @webmethod(route="/prompts/{prompt_id}", method="DELETE", level=LLAMA_STACK_API_V1)
-    async def delete_prompt(
-        self,
-        prompt_id: str,
-    ) -> None:
-        """Delete prompt.
-
-        Delete a prompt.
-
-        :param prompt_id: The identifier of the prompt to delete.
-        """
-        ...
-
-    @webmethod(route="/prompts/{prompt_id}/set-default-version", method="PUT", level=LLAMA_STACK_API_V1)
-    async def set_default_version(
-        self,
-        prompt_id: str,
-        version: int,
-    ) -> Prompt:
-        """Set prompt version.
-
-        Set which version of a prompt should be the default in get_prompt (latest).
-
-        :param prompt_id: The identifier of the prompt.
-        :param version: The version to set as default.
-        :returns: The prompt with the specified version now set as default.
-        """
-        ...
--- a/src/llama_stack/apis/providers/init.py
+++ b/src/llama_stack/apis/providers/init.py
@ -1,7 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from .providers import *
--- a/src/llama_stack/apis/providers/providers.py
+++ b/src/llama_stack/apis/providers/providers.py
@ -1,69 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from typing import Any, Protocol, runtime_checkable
-
-from pydantic import BaseModel
-
-from llama_stack.apis.version import LLAMA_STACK_API_V1
-from llama_stack.providers.datatypes import HealthResponse
-from llama_stack.schema_utils import json_schema_type, webmethod
-
-
-@json_schema_type
-class ProviderInfo(BaseModel):
-    """Information about a registered provider including its configuration and health status.
-
-    :param api: The API name this provider implements
-    :param provider_id: Unique identifier for the provider
-    :param provider_type: The type of provider implementation
-    :param config: Configuration parameters for the provider
-    :param health: Current health status of the provider
-    """
-
-    api: str
-    provider_id: str
-    provider_type: str
-    config: dict[str, Any]
-    health: HealthResponse
-
-
-class ListProvidersResponse(BaseModel):
-    """Response containing a list of all available providers.
-
-    :param data: List of provider information objects
-    """
-
-    data: list[ProviderInfo]
-
-
-@runtime_checkable
-class Providers(Protocol):
-    """Providers
-
-    Providers API for inspecting, listing, and modifying providers and their configurations.
-    """
-
-    @webmethod(route="/providers", method="GET", level=LLAMA_STACK_API_V1)
-    async def list_providers(self) -> ListProvidersResponse:
-        """List providers.
-
-        List all available providers.
-
-        :returns: A ListProvidersResponse containing information about all providers.
-        """
-        ...
-
-    @webmethod(route="/providers/{provider_id}", method="GET", level=LLAMA_STACK_API_V1)
-    async def inspect_provider(self, provider_id: str) -> ProviderInfo:
-        """Get provider.
-
-        Get detailed information about a specific provider.
-
-        :param provider_id: The ID of the provider to inspect.
-        :returns: A ProviderInfo object containing the provider's details.
-        """
-        ...
--- a/src/llama_stack/apis/resource.py
+++ b/src/llama_stack/apis/resource.py
@ -1,37 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-
-from enum import StrEnum
-
-from pydantic import BaseModel, Field
-
-
-class ResourceType(StrEnum):
-    model = "model"
-    shield = "shield"
-    vector_store = "vector_store"
-    dataset = "dataset"
-    scoring_function = "scoring_function"
-    benchmark = "benchmark"
-    tool = "tool"
-    tool_group = "tool_group"
-    prompt = "prompt"
-
-
-class Resource(BaseModel):
-    """Base class for all Llama Stack resources"""
-
-    identifier: str = Field(description="Unique identifier for this resource in llama stack")
-
-    provider_resource_id: str | None = Field(
-        default=None,
-        description="Unique identifier for this resource in the provider",
-    )
-
-    provider_id: str = Field(description="ID of the provider that owns this resource")
-
-    type: ResourceType = Field(description="Type of resource (e.g. 'model', 'shield', 'vector_store', etc.)")
--- a/src/llama_stack/apis/safety/init.py
+++ b/src/llama_stack/apis/safety/init.py
@ -1,7 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from .safety import *
--- a/src/llama_stack/apis/safety/safety.py
+++ b/src/llama_stack/apis/safety/safety.py
@ -1,134 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from enum import Enum
-from typing import Any, Protocol, runtime_checkable
-
-from pydantic import BaseModel, Field
-
-from llama_stack.apis.common.tracing import telemetry_traceable
-from llama_stack.apis.inference import OpenAIMessageParam
-from llama_stack.apis.shields import Shield
-from llama_stack.apis.version import LLAMA_STACK_API_V1
-from llama_stack.schema_utils import json_schema_type, webmethod
-
-
-@json_schema_type
-class ModerationObjectResults(BaseModel):
-    """A moderation object.
-    :param flagged: Whether any of the below categories are flagged.
-    :param categories: A list of the categories, and whether they are flagged or not.
-    :param category_applied_input_types: A list of the categories along with the input type(s) that the score applies to.
-    :param category_scores: A list of the categories along with their scores as predicted by model.
-    """
-
-    flagged: bool
-    categories: dict[str, bool] | None = None
-    category_applied_input_types: dict[str, list[str]] | None = None
-    category_scores: dict[str, float] | None = None
-    user_message: str | None = None
-    metadata: dict[str, Any] = Field(default_factory=dict)
-
-
-@json_schema_type
-class ModerationObject(BaseModel):
-    """A moderation object.
-    :param id: The unique identifier for the moderation request.
-    :param model: The model used to generate the moderation results.
-    :param results: A list of moderation objects
-    """
-
-    id: str
-    model: str
-    results: list[ModerationObjectResults]
-
-
-@json_schema_type
-class ViolationLevel(Enum):
-    """Severity level of a safety violation.
-
-    :cvar INFO: Informational level violation that does not require action
-    :cvar WARN: Warning level violation that suggests caution but allows continuation
-    :cvar ERROR: Error level violation that requires blocking or intervention
-    """
-
-    INFO = "info"
-    WARN = "warn"
-    ERROR = "error"
-
-
-@json_schema_type
-class SafetyViolation(BaseModel):
-    """Details of a safety violation detected by content moderation.
-
-    :param violation_level: Severity level of the violation
-    :param user_message: (Optional) Message to convey to the user about the violation
-    :param metadata: Additional metadata including specific violation codes for debugging and telemetry
-    """
-
-    violation_level: ViolationLevel
-
-    # what message should you convey to the user
-    user_message: str | None = None
-
-    # additional metadata (including specific violation codes) more for
-    # debugging, telemetry
-    metadata: dict[str, Any] = Field(default_factory=dict)
-
-
-@json_schema_type
-class RunShieldResponse(BaseModel):
-    """Response from running a safety shield.
-
-    :param violation: (Optional) Safety violation detected by the shield, if any
-    """
-
-    violation: SafetyViolation | None = None
-
-
-class ShieldStore(Protocol):
-    async def get_shield(self, identifier: str) -> Shield: ...
-
-
-@runtime_checkable
-@telemetry_traceable
-class Safety(Protocol):
-    """Safety
-
-    OpenAI-compatible Moderations API.
-    """
-
-    shield_store: ShieldStore
-
-    @webmethod(route="/safety/run-shield", method="POST", level=LLAMA_STACK_API_V1)
-    async def run_shield(
-        self,
-        shield_id: str,
-        messages: list[OpenAIMessageParam],
-        params: dict[str, Any],
-    ) -> RunShieldResponse:
-        """Run shield.
-
-        Run a shield.
-
-        :param shield_id: The identifier of the shield to run.
-        :param messages: The messages to run the shield on.
-        :param params: The parameters of the shield.
-        :returns: A RunShieldResponse.
-        """
-        ...
-
-    @webmethod(route="/moderations", method="POST", level=LLAMA_STACK_API_V1)
-    async def run_moderation(self, input: str | list[str], model: str | None = None) -> ModerationObject:
-        """Create moderation.
-
-        Classifies if text and/or image inputs are potentially harmful.
-        :param input: Input (or inputs) to classify.
-        Can be a single string, an array of strings, or an array of multi-modal input objects similar to other models.
-        :param model: (Optional) The content moderation model you would like to use.
-        :returns: A moderation object.
-        """
-        ...
--- a/src/llama_stack/apis/scoring/init.py
+++ b/src/llama_stack/apis/scoring/init.py
@ -1,7 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from .scoring import *
--- a/src/llama_stack/apis/scoring/scoring.py
+++ b/src/llama_stack/apis/scoring/scoring.py
@ -1,93 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from typing import Any, Protocol, runtime_checkable
-
-from pydantic import BaseModel
-
-from llama_stack.apis.scoring_functions import ScoringFn, ScoringFnParams
-from llama_stack.apis.version import LLAMA_STACK_API_V1
-from llama_stack.schema_utils import json_schema_type, webmethod
-
-# mapping of metric to value
-ScoringResultRow = dict[str, Any]
-
-
-@json_schema_type
-class ScoringResult(BaseModel):
-    """
-    A scoring result for a single row.
-
-    :param score_rows: The scoring result for each row. Each row is a map of column name to value.
-    :param aggregated_results: Map of metric name to aggregated value
-    """
-
-    score_rows: list[ScoringResultRow]
-    # aggregated metrics to value
-    aggregated_results: dict[str, Any]
-
-
-@json_schema_type
-class ScoreBatchResponse(BaseModel):
-    """Response from batch scoring operations on datasets.
-
-    :param dataset_id: (Optional) The identifier of the dataset that was scored
-    :param results: A map of scoring function name to ScoringResult
-    """
-
-    dataset_id: str | None = None
-    results: dict[str, ScoringResult]
-
-
-@json_schema_type
-class ScoreResponse(BaseModel):
-    """
-    The response from scoring.
-
-    :param results: A map of scoring function name to ScoringResult.
-    """
-
-    # each key in the dict is a scoring function name
-    results: dict[str, ScoringResult]
-
-
-class ScoringFunctionStore(Protocol):
-    def get_scoring_function(self, scoring_fn_id: str) -> ScoringFn: ...
-
-
-@runtime_checkable
-class Scoring(Protocol):
-    scoring_function_store: ScoringFunctionStore
-
-    @webmethod(route="/scoring/score-batch", method="POST", level=LLAMA_STACK_API_V1)
-    async def score_batch(
-        self,
-        dataset_id: str,
-        scoring_functions: dict[str, ScoringFnParams | None],
-        save_results_dataset: bool = False,
-    ) -> ScoreBatchResponse:
-        """Score a batch of rows.
-
-        :param dataset_id: The ID of the dataset to score.
-        :param scoring_functions: The scoring functions to use for the scoring.
-        :param save_results_dataset: Whether to save the results to a dataset.
-        :returns: A ScoreBatchResponse.
-        """
-        ...
-
-    @webmethod(route="/scoring/score", method="POST", level=LLAMA_STACK_API_V1)
-    async def score(
-        self,
-        input_rows: list[dict[str, Any]],
-        scoring_functions: dict[str, ScoringFnParams | None],
-    ) -> ScoreResponse:
-        """Score a list of rows.
-
-        :param input_rows: The rows to score.
-        :param scoring_functions: The scoring functions to use for the scoring.
-        :returns: A ScoreResponse object containing rows and aggregated results.
-        """
-        ...
--- a/src/llama_stack/apis/scoring_functions/init.py
+++ b/src/llama_stack/apis/scoring_functions/init.py
@ -1,7 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from .scoring_functions import *
--- a/src/llama_stack/apis/scoring_functions/scoring_functions.py
+++ b/src/llama_stack/apis/scoring_functions/scoring_functions.py
@ -1,210 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-# TODO: use enum.StrEnum when we drop support for python 3.10
-from enum import StrEnum
-from typing import (
-    Annotated,
-    Any,
-    Literal,
-    Protocol,
-    runtime_checkable,
-)
-
-from pydantic import BaseModel, Field
-
-from llama_stack.apis.common.type_system import ParamType
-from llama_stack.apis.resource import Resource, ResourceType
-from llama_stack.apis.version import LLAMA_STACK_API_V1
-from llama_stack.schema_utils import json_schema_type, register_schema, webmethod
-
-
-# Perhaps more structure can be imposed on these functions. Maybe they could be associated
-# with standard metrics so they can be rolled up?
-@json_schema_type
-class ScoringFnParamsType(StrEnum):
-    """Types of scoring function parameter configurations.
-    :cvar llm_as_judge: Use an LLM model to evaluate and score responses
-    :cvar regex_parser: Use regex patterns to extract and score specific parts of responses
-    :cvar basic: Basic scoring with simple aggregation functions
-    """
-
-    llm_as_judge = "llm_as_judge"
-    regex_parser = "regex_parser"
-    basic = "basic"
-
-
-@json_schema_type
-class AggregationFunctionType(StrEnum):
-    """Types of aggregation functions for scoring results.
-    :cvar average: Calculate the arithmetic mean of scores
-    :cvar weighted_average: Calculate a weighted average of scores
-    :cvar median: Calculate the median value of scores
-    :cvar categorical_count: Count occurrences of categorical values
-    :cvar accuracy: Calculate accuracy as the proportion of correct answers
-    """
-
-    average = "average"
-    weighted_average = "weighted_average"
-    median = "median"
-    categorical_count = "categorical_count"
-    accuracy = "accuracy"
-
-
-@json_schema_type
-class LLMAsJudgeScoringFnParams(BaseModel):
-    """Parameters for LLM-as-judge scoring function configuration.
-    :param type: The type of scoring function parameters, always llm_as_judge
-    :param judge_model: Identifier of the LLM model to use as a judge for scoring
-    :param prompt_template: (Optional) Custom prompt template for the judge model
-    :param judge_score_regexes: Regexes to extract the answer from generated response
-    :param aggregation_functions: Aggregation functions to apply to the scores of each row
-    """
-
-    type: Literal[ScoringFnParamsType.llm_as_judge] = ScoringFnParamsType.llm_as_judge
-    judge_model: str
-    prompt_template: str | None = None
-    judge_score_regexes: list[str] = Field(
-        description="Regexes to extract the answer from generated response",
-        default_factory=lambda: [],
-    )
-    aggregation_functions: list[AggregationFunctionType] = Field(
-        description="Aggregation functions to apply to the scores of each row",
-        default_factory=lambda: [],
-    )
-
-
-@json_schema_type
-class RegexParserScoringFnParams(BaseModel):
-    """Parameters for regex parser scoring function configuration.
-    :param type: The type of scoring function parameters, always regex_parser
-    :param parsing_regexes: Regex to extract the answer from generated response
-    :param aggregation_functions: Aggregation functions to apply to the scores of each row
-    """
-
-    type: Literal[ScoringFnParamsType.regex_parser] = ScoringFnParamsType.regex_parser
-    parsing_regexes: list[str] = Field(
-        description="Regex to extract the answer from generated response",
-        default_factory=lambda: [],
-    )
-    aggregation_functions: list[AggregationFunctionType] = Field(
-        description="Aggregation functions to apply to the scores of each row",
-        default_factory=lambda: [],
-    )
-
-
-@json_schema_type
-class BasicScoringFnParams(BaseModel):
-    """Parameters for basic scoring function configuration.
-    :param type: The type of scoring function parameters, always basic
-    :param aggregation_functions: Aggregation functions to apply to the scores of each row
-    """
-
-    type: Literal[ScoringFnParamsType.basic] = ScoringFnParamsType.basic
-    aggregation_functions: list[AggregationFunctionType] = Field(
-        description="Aggregation functions to apply to the scores of each row",
-        default_factory=list,
-    )
-
-
-ScoringFnParams = Annotated[
-    LLMAsJudgeScoringFnParams | RegexParserScoringFnParams | BasicScoringFnParams,
-    Field(discriminator="type"),
-]
-register_schema(ScoringFnParams, name="ScoringFnParams")
-
-
-class CommonScoringFnFields(BaseModel):
-    description: str | None = None
-    metadata: dict[str, Any] = Field(
-        default_factory=dict,
-        description="Any additional metadata for this definition",
-    )
-    return_type: ParamType = Field(
-        description="The return type of the deterministic function",
-    )
-    params: ScoringFnParams | None = Field(
-        description="The parameters for the scoring function for benchmark eval, these can be overridden for app eval",
-        default=None,
-    )
-
-
-@json_schema_type
-class ScoringFn(CommonScoringFnFields, Resource):
-    """A scoring function resource for evaluating model outputs.
-    :param type: The resource type, always scoring_function
-    """
-
-    type: Literal[ResourceType.scoring_function] = ResourceType.scoring_function
-
-    @property
-    def scoring_fn_id(self) -> str:
-        return self.identifier
-
-    @property
-    def provider_scoring_fn_id(self) -> str | None:
-        return self.provider_resource_id
-
-
-class ScoringFnInput(CommonScoringFnFields, BaseModel):
-    scoring_fn_id: str
-    provider_id: str | None = None
-    provider_scoring_fn_id: str | None = None
-
-
-class ListScoringFunctionsResponse(BaseModel):
-    data: list[ScoringFn]
-
-
-@runtime_checkable
-class ScoringFunctions(Protocol):
-    @webmethod(route="/scoring-functions", method="GET", level=LLAMA_STACK_API_V1)
-    async def list_scoring_functions(self) -> ListScoringFunctionsResponse:
-        """List all scoring functions.
-
-        :returns: A ListScoringFunctionsResponse.
-        """
-        ...
-
-    @webmethod(route="/scoring-functions/{scoring_fn_id:path}", method="GET", level=LLAMA_STACK_API_V1)
-    async def get_scoring_function(self, scoring_fn_id: str, /) -> ScoringFn:
-        """Get a scoring function by its ID.
-
-        :param scoring_fn_id: The ID of the scoring function to get.
-        :returns: A ScoringFn.
-        """
-        ...
-
-    @webmethod(route="/scoring-functions", method="POST", level=LLAMA_STACK_API_V1, deprecated=True)
-    async def register_scoring_function(
-        self,
-        scoring_fn_id: str,
-        description: str,
-        return_type: ParamType,
-        provider_scoring_fn_id: str | None = None,
-        provider_id: str | None = None,
-        params: ScoringFnParams | None = None,
-    ) -> None:
-        """Register a scoring function.
-
-        :param scoring_fn_id: The ID of the scoring function to register.
-        :param description: The description of the scoring function.
-        :param return_type: The return type of the scoring function.
-        :param provider_scoring_fn_id: The ID of the provider scoring function to use for the scoring function.
-        :param provider_id: The ID of the provider to use for the scoring function.
-        :param params: The parameters for the scoring function for benchmark eval, these can be overridden for app eval.
-        """
-        ...
-
-    @webmethod(
-        route="/scoring-functions/{scoring_fn_id:path}", method="DELETE", level=LLAMA_STACK_API_V1, deprecated=True
-    )
-    async def unregister_scoring_function(self, scoring_fn_id: str) -> None:
-        """Unregister a scoring function.
-
-        :param scoring_fn_id: The ID of the scoring function to unregister.
-        """
-        ...
--- a/src/llama_stack/apis/shields/init.py
+++ b/src/llama_stack/apis/shields/init.py
@ -1,7 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from .shields import *
--- a/src/llama_stack/apis/shields/shields.py
+++ b/src/llama_stack/apis/shields/shields.py
@ -1,94 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from typing import Any, Literal, Protocol, runtime_checkable
-
-from pydantic import BaseModel
-
-from llama_stack.apis.common.tracing import telemetry_traceable
-from llama_stack.apis.resource import Resource, ResourceType
-from llama_stack.apis.version import LLAMA_STACK_API_V1
-from llama_stack.schema_utils import json_schema_type, webmethod
-
-
-class CommonShieldFields(BaseModel):
-    params: dict[str, Any] | None = None
-
-
-@json_schema_type
-class Shield(CommonShieldFields, Resource):
-    """A safety shield resource that can be used to check content.
-
-    :param params: (Optional) Configuration parameters for the shield
-    :param type: The resource type, always shield
-    """
-
-    type: Literal[ResourceType.shield] = ResourceType.shield
-
-    @property
-    def shield_id(self) -> str:
-        return self.identifier
-
-    @property
-    def provider_shield_id(self) -> str | None:
-        return self.provider_resource_id
-
-
-class ShieldInput(CommonShieldFields):
-    shield_id: str
-    provider_id: str | None = None
-    provider_shield_id: str | None = None
-
-
-class ListShieldsResponse(BaseModel):
-    data: list[Shield]
-
-
-@runtime_checkable
-@telemetry_traceable
-class Shields(Protocol):
-    @webmethod(route="/shields", method="GET", level=LLAMA_STACK_API_V1)
-    async def list_shields(self) -> ListShieldsResponse:
-        """List all shields.
-
-        :returns: A ListShieldsResponse.
-        """
-        ...
-
-    @webmethod(route="/shields/{identifier:path}", method="GET", level=LLAMA_STACK_API_V1)
-    async def get_shield(self, identifier: str) -> Shield:
-        """Get a shield by its identifier.
-
-        :param identifier: The identifier of the shield to get.
-        :returns: A Shield.
-        """
-        ...
-
-    @webmethod(route="/shields", method="POST", level=LLAMA_STACK_API_V1, deprecated=True)
-    async def register_shield(
-        self,
-        shield_id: str,
-        provider_shield_id: str | None = None,
-        provider_id: str | None = None,
-        params: dict[str, Any] | None = None,
-    ) -> Shield:
-        """Register a shield.
-
-        :param shield_id: The identifier of the shield to register.
-        :param provider_shield_id: The identifier of the shield in the provider.
-        :param provider_id: The identifier of the provider.
-        :param params: The parameters of the shield.
-        :returns: A Shield.
-        """
-        ...
-
-    @webmethod(route="/shields/{identifier:path}", method="DELETE", level=LLAMA_STACK_API_V1, deprecated=True)
-    async def unregister_shield(self, identifier: str) -> None:
-        """Unregister a shield.
-
-        :param identifier: The identifier of the shield to unregister.
-        """
-        ...
--- a/src/llama_stack/apis/tools/init.py
+++ b/src/llama_stack/apis/tools/init.py
@ -1,8 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from .rag_tool import *
-from .tools import *
--- a/src/llama_stack/apis/tools/rag_tool.py
+++ b/src/llama_stack/apis/tools/rag_tool.py
@ -1,168 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from enum import Enum, StrEnum
-from typing import Annotated, Any, Literal
-
-from pydantic import BaseModel, Field, field_validator
-
-from llama_stack.apis.common.content_types import URL, InterleavedContent
-
-
-class RRFRanker(BaseModel):
-    """
-    Reciprocal Rank Fusion (RRF) ranker configuration.
-
-    :param type: The type of ranker, always "rrf"
-    :param impact_factor: The impact factor for RRF scoring. Higher values give more weight to higher-ranked results.
-                         Must be greater than 0
-    """
-
-    type: Literal["rrf"] = "rrf"
-    impact_factor: float = Field(default=60.0, gt=0.0)  # default of 60 for optimal performance
-
-
-class WeightedRanker(BaseModel):
-    """
-    Weighted ranker configuration that combines vector and keyword scores.
-
-    :param type: The type of ranker, always "weighted"
-    :param alpha: Weight factor between 0 and 1.
-                 0 means only use keyword scores,
-                 1 means only use vector scores,
-                 values in between blend both scores.
-    """
-
-    type: Literal["weighted"] = "weighted"
-    alpha: float = Field(
-        default=0.5,
-        ge=0.0,
-        le=1.0,
-        description="Weight factor between 0 and 1. 0 means only keyword scores, 1 means only vector scores.",
-    )
-
-
-Ranker = Annotated[
-    RRFRanker | WeightedRanker,
-    Field(discriminator="type"),
-]
-
-
-class RAGDocument(BaseModel):
-    """
-    A document to be used for document ingestion in the RAG Tool.
-
-    :param document_id: The unique identifier for the document.
-    :param content: The content of the document.
-    :param mime_type: The MIME type of the document.
-    :param metadata: Additional metadata for the document.
-    """
-
-    document_id: str
-    content: InterleavedContent | URL
-    mime_type: str | None = None
-    metadata: dict[str, Any] = Field(default_factory=dict)
-
-
-class RAGQueryResult(BaseModel):
-    """Result of a RAG query containing retrieved content and metadata.
-
-    :param content: (Optional) The retrieved content from the query
-    :param metadata: Additional metadata about the query result
-    """
-
-    content: InterleavedContent | None = None
-    metadata: dict[str, Any] = Field(default_factory=dict)
-
-
-class RAGQueryGenerator(Enum):
-    """Types of query generators for RAG systems.
-
-    :cvar default: Default query generator using simple text processing
-    :cvar llm: LLM-based query generator for enhanced query understanding
-    :cvar custom: Custom query generator implementation
-    """
-
-    default = "default"
-    llm = "llm"
-    custom = "custom"
-
-
-class RAGSearchMode(StrEnum):
-    """
-    Search modes for RAG query retrieval:
-    - VECTOR: Uses vector similarity search for semantic matching
-    - KEYWORD: Uses keyword-based search for exact matching
-    - HYBRID: Combines both vector and keyword search for better results
-    """
-
-    VECTOR = "vector"
-    KEYWORD = "keyword"
-    HYBRID = "hybrid"
-
-
-class DefaultRAGQueryGeneratorConfig(BaseModel):
-    """Configuration for the default RAG query generator.
-
-    :param type: Type of query generator, always 'default'
-    :param separator: String separator used to join query terms
-    """
-
-    type: Literal["default"] = "default"
-    separator: str = " "
-
-
-class LLMRAGQueryGeneratorConfig(BaseModel):
-    """Configuration for the LLM-based RAG query generator.
-
-    :param type: Type of query generator, always 'llm'
-    :param model: Name of the language model to use for query generation
-    :param template: Template string for formatting the query generation prompt
-    """
-
-    type: Literal["llm"] = "llm"
-    model: str
-    template: str
-
-
-RAGQueryGeneratorConfig = Annotated[
-    DefaultRAGQueryGeneratorConfig | LLMRAGQueryGeneratorConfig,
-    Field(discriminator="type"),
-]
-
-
-class RAGQueryConfig(BaseModel):
-    """
-    Configuration for the RAG query generation.
-
-    :param query_generator_config: Configuration for the query generator.
-    :param max_tokens_in_context: Maximum number of tokens in the context.
-    :param max_chunks: Maximum number of chunks to retrieve.
-    :param chunk_template: Template for formatting each retrieved chunk in the context.
-        Available placeholders: {index} (1-based chunk ordinal), {chunk.content} (chunk content string), {metadata} (chunk metadata dict).
-        Default: "Result {index}\\nContent: {chunk.content}\\nMetadata: {metadata}\\n"
-    :param mode: Search mode for retrieval—either "vector", "keyword", or "hybrid". Default "vector".
-    :param ranker: Configuration for the ranker to use in hybrid search. Defaults to RRF ranker.
-    """
-
-    # This config defines how a query is generated using the messages
-    # for memory bank retrieval.
-    query_generator_config: RAGQueryGeneratorConfig = Field(default=DefaultRAGQueryGeneratorConfig())
-    max_tokens_in_context: int = 4096
-    max_chunks: int = 5
-    chunk_template: str = "Result {index}\nContent: {chunk.content}\nMetadata: {metadata}\n"
-    mode: RAGSearchMode | None = RAGSearchMode.VECTOR
-    ranker: Ranker | None = Field(default=None)  # Only used for hybrid mode
-
-    @field_validator("chunk_template")
-    def validate_chunk_template(cls, v: str) -> str:
-        if "{chunk.content}" not in v:
-            raise ValueError("chunk_template must contain {chunk.content}")
-        if "{index}" not in v:
-            raise ValueError("chunk_template must contain {index}")
-        if len(v) == 0:
-            raise ValueError("chunk_template must not be empty")
-        return v
--- a/src/llama_stack/apis/tools/tools.py
+++ b/src/llama_stack/apis/tools/tools.py
@ -1,217 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from enum import Enum
-from typing import Any, Literal, Protocol
-
-from pydantic import BaseModel
-from typing_extensions import runtime_checkable
-
-from llama_stack.apis.common.content_types import URL, InterleavedContent
-from llama_stack.apis.common.tracing import telemetry_traceable
-from llama_stack.apis.resource import Resource, ResourceType
-from llama_stack.apis.version import LLAMA_STACK_API_V1
-from llama_stack.schema_utils import json_schema_type, webmethod
-
-
-@json_schema_type
-class ToolDef(BaseModel):
-    """Tool definition used in runtime contexts.
-
-    :param name: Name of the tool
-    :param description: (Optional) Human-readable description of what the tool does
-    :param input_schema: (Optional) JSON Schema for tool inputs (MCP inputSchema)
-    :param output_schema: (Optional) JSON Schema for tool outputs (MCP outputSchema)
-    :param metadata: (Optional) Additional metadata about the tool
-    :param toolgroup_id: (Optional) ID of the tool group this tool belongs to
-    """
-
-    toolgroup_id: str | None = None
-    name: str
-    description: str | None = None
-    input_schema: dict[str, Any] | None = None
-    output_schema: dict[str, Any] | None = None
-    metadata: dict[str, Any] | None = None
-
-
-@json_schema_type
-class ToolGroupInput(BaseModel):
-    """Input data for registering a tool group.
-
-    :param toolgroup_id: Unique identifier for the tool group
-    :param provider_id: ID of the provider that will handle this tool group
-    :param args: (Optional) Additional arguments to pass to the provider
-    :param mcp_endpoint: (Optional) Model Context Protocol endpoint for remote tools
-    """
-
-    toolgroup_id: str
-    provider_id: str
-    args: dict[str, Any] | None = None
-    mcp_endpoint: URL | None = None
-
-
-@json_schema_type
-class ToolGroup(Resource):
-    """A group of related tools managed together.
-
-    :param type: Type of resource, always 'tool_group'
-    :param mcp_endpoint: (Optional) Model Context Protocol endpoint for remote tools
-    :param args: (Optional) Additional arguments for the tool group
-    """
-
-    type: Literal[ResourceType.tool_group] = ResourceType.tool_group
-    mcp_endpoint: URL | None = None
-    args: dict[str, Any] | None = None
-
-
-@json_schema_type
-class ToolInvocationResult(BaseModel):
-    """Result of a tool invocation.
-
-    :param content: (Optional) The output content from the tool execution
-    :param error_message: (Optional) Error message if the tool execution failed
-    :param error_code: (Optional) Numeric error code if the tool execution failed
-    :param metadata: (Optional) Additional metadata about the tool execution
-    """
-
-    content: InterleavedContent | None = None
-    error_message: str | None = None
-    error_code: int | None = None
-    metadata: dict[str, Any] | None = None
-
-
-class ToolStore(Protocol):
-    async def get_tool(self, tool_name: str) -> ToolDef: ...
-    async def get_tool_group(self, toolgroup_id: str) -> ToolGroup: ...
-
-
-class ListToolGroupsResponse(BaseModel):
-    """Response containing a list of tool groups.
-
-    :param data: List of tool groups
-    """
-
-    data: list[ToolGroup]
-
-
-class ListToolDefsResponse(BaseModel):
-    """Response containing a list of tool definitions.
-
-    :param data: List of tool definitions
-    """
-
-    data: list[ToolDef]
-
-
-@runtime_checkable
-@telemetry_traceable
-class ToolGroups(Protocol):
-    @webmethod(route="/toolgroups", method="POST", level=LLAMA_STACK_API_V1, deprecated=True)
-    async def register_tool_group(
-        self,
-        toolgroup_id: str,
-        provider_id: str,
-        mcp_endpoint: URL | None = None,
-        args: dict[str, Any] | None = None,
-    ) -> None:
-        """Register a tool group.
-
-        :param toolgroup_id: The ID of the tool group to register.
-        :param provider_id: The ID of the provider to use for the tool group.
-        :param mcp_endpoint: The MCP endpoint to use for the tool group.
-        :param args: A dictionary of arguments to pass to the tool group.
-        """
-        ...
-
-    @webmethod(route="/toolgroups/{toolgroup_id:path}", method="GET", level=LLAMA_STACK_API_V1)
-    async def get_tool_group(
-        self,
-        toolgroup_id: str,
-    ) -> ToolGroup:
-        """Get a tool group by its ID.
-
-        :param toolgroup_id: The ID of the tool group to get.
-        :returns: A ToolGroup.
-        """
-        ...
-
-    @webmethod(route="/toolgroups", method="GET", level=LLAMA_STACK_API_V1)
-    async def list_tool_groups(self) -> ListToolGroupsResponse:
-        """List tool groups with optional provider.
-
-        :returns: A ListToolGroupsResponse.
-        """
-        ...
-
-    @webmethod(route="/tools", method="GET", level=LLAMA_STACK_API_V1)
-    async def list_tools(self, toolgroup_id: str | None = None) -> ListToolDefsResponse:
-        """List tools with optional tool group.
-
-        :param toolgroup_id: The ID of the tool group to list tools for.
-        :returns: A ListToolDefsResponse.
-        """
-        ...
-
-    @webmethod(route="/tools/{tool_name:path}", method="GET", level=LLAMA_STACK_API_V1)
-    async def get_tool(
-        self,
-        tool_name: str,
-    ) -> ToolDef:
-        """Get a tool by its name.
-
-        :param tool_name: The name of the tool to get.
-        :returns: A ToolDef.
-        """
-        ...
-
-    @webmethod(route="/toolgroups/{toolgroup_id:path}", method="DELETE", level=LLAMA_STACK_API_V1, deprecated=True)
-    async def unregister_toolgroup(
-        self,
-        toolgroup_id: str,
-    ) -> None:
-        """Unregister a tool group.
-
-        :param toolgroup_id: The ID of the tool group to unregister.
-        """
-        ...
-
-
-class SpecialToolGroup(Enum):
-    """Special tool groups with predefined functionality.
-
-    :cvar rag_tool: Retrieval-Augmented Generation tool group for document search and retrieval
-    """
-
-    rag_tool = "rag_tool"
-
-
-@runtime_checkable
-@telemetry_traceable
-class ToolRuntime(Protocol):
-    tool_store: ToolStore | None = None
-
-    # TODO: This needs to be renamed once OPEN API generator name conflict issue is fixed.
-    @webmethod(route="/tool-runtime/list-tools", method="GET", level=LLAMA_STACK_API_V1)
-    async def list_runtime_tools(
-        self, tool_group_id: str | None = None, mcp_endpoint: URL | None = None
-    ) -> ListToolDefsResponse:
-        """List all tools in the runtime.
-
-        :param tool_group_id: The ID of the tool group to list tools for.
-        :param mcp_endpoint: The MCP endpoint to use for the tool group.
-        :returns: A ListToolDefsResponse.
-        """
-        ...
-
-    @webmethod(route="/tool-runtime/invoke", method="POST", level=LLAMA_STACK_API_V1)
-    async def invoke_tool(self, tool_name: str, kwargs: dict[str, Any]) -> ToolInvocationResult:
-        """Run a tool with the given arguments.
-
-        :param tool_name: The name of the tool to invoke.
-        :param kwargs: A dictionary of arguments to pass to the tool.
-        :returns: A ToolInvocationResult.
-        """
-        ...
--- a/src/llama_stack/apis/vector_io/init.py
+++ b/src/llama_stack/apis/vector_io/init.py
@ -1,7 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from .vector_io import *
--- a/src/llama_stack/apis/vector_io/vector_io.py
+++ b/src/llama_stack/apis/vector_io/vector_io.py
@ -1,872 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-from typing import Annotated, Any, Literal, Protocol, runtime_checkable
-
-from fastapi import Body, Query
-from pydantic import BaseModel, Field
-
-from llama_stack.apis.common.tracing import telemetry_traceable
-from llama_stack.apis.inference import InterleavedContent
-from llama_stack.apis.vector_stores import VectorStore
-from llama_stack.apis.version import LLAMA_STACK_API_V1
-from llama_stack.schema_utils import json_schema_type, webmethod
-from llama_stack.strong_typing.schema import register_schema
-
-
-@json_schema_type
-class ChunkMetadata(BaseModel):
-    """
-    `ChunkMetadata` is backend metadata for a `Chunk` that is used to store additional information about the chunk that
-        will not be used in the context during inference, but is required for backend functionality. The `ChunkMetadata`
-        is set during chunk creation in `MemoryToolRuntimeImpl().insert()`and is not expected to change after.
-        Use `Chunk.metadata` for metadata that will be used in the context during inference.
-    :param chunk_id: The ID of the chunk. If not set, it will be generated based on the document ID and content.
-    :param document_id: The ID of the document this chunk belongs to.
-    :param source: The source of the content, such as a URL, file path, or other identifier.
-    :param created_timestamp: An optional timestamp indicating when the chunk was created.
-    :param updated_timestamp: An optional timestamp indicating when the chunk was last updated.
-    :param chunk_window: The window of the chunk, which can be used to group related chunks together.
-    :param chunk_tokenizer: The tokenizer used to create the chunk. Default is Tiktoken.
-    :param chunk_embedding_model: The embedding model used to create the chunk's embedding.
-    :param chunk_embedding_dimension: The dimension of the embedding vector for the chunk.
-    :param content_token_count: The number of tokens in the content of the chunk.
-    :param metadata_token_count: The number of tokens in the metadata of the chunk.
-    """
-
-    chunk_id: str | None = None
-    document_id: str | None = None
-    source: str | None = None
-    created_timestamp: int | None = None
-    updated_timestamp: int | None = None
-    chunk_window: str | None = None
-    chunk_tokenizer: str | None = None
-    chunk_embedding_model: str | None = None
-    chunk_embedding_dimension: int | None = None
-    content_token_count: int | None = None
-    metadata_token_count: int | None = None
-
-
-@json_schema_type
-class Chunk(BaseModel):
-    """
-    A chunk of content that can be inserted into a vector database.
-    :param content: The content of the chunk, which can be interleaved text, images, or other types.
-    :param chunk_id: Unique identifier for the chunk. Must be provided explicitly.
-    :param metadata: Metadata associated with the chunk that will be used in the model context during inference.
-    :param embedding: Optional embedding for the chunk. If not provided, it will be computed later.
-    :param chunk_metadata: Metadata for the chunk that will NOT be used in the context during inference.
-        The `chunk_metadata` is required backend functionality.
-    """
-
-    content: InterleavedContent
-    chunk_id: str
-    metadata: dict[str, Any] = Field(default_factory=dict)
-    embedding: list[float] | None = None
-    chunk_metadata: ChunkMetadata | None = None
-
-    @property
-    def document_id(self) -> str | None:
-        """Returns the document_id from either metadata or chunk_metadata, with metadata taking precedence."""
-        # Check metadata first (takes precedence)
-        doc_id = self.metadata.get("document_id")
-        if doc_id is not None:
-            if not isinstance(doc_id, str):
-                raise TypeError(f"metadata['document_id'] must be a string, got {type(doc_id).__name__}: {doc_id!r}")
-            return doc_id
-
-        # Fall back to chunk_metadata if available (Pydantic ensures type safety)
-        if self.chunk_metadata is not None:
-            return self.chunk_metadata.document_id
-
-        return None
-
-
-@json_schema_type
-class QueryChunksResponse(BaseModel):
-    """Response from querying chunks in a vector database.
-
-    :param chunks: List of content chunks returned from the query
-    :param scores: Relevance scores corresponding to each returned chunk
-    """
-
-    chunks: list[Chunk]
-    scores: list[float]
-
-
-@json_schema_type
-class VectorStoreFileCounts(BaseModel):
-    """File processing status counts for a vector store.
-
-    :param completed: Number of files that have been successfully processed
-    :param cancelled: Number of files that had their processing cancelled
-    :param failed: Number of files that failed to process
-    :param in_progress: Number of files currently being processed
-    :param total: Total number of files in the vector store
-    """
-
-    completed: int
-    cancelled: int
-    failed: int
-    in_progress: int
-    total: int
-
-
-# TODO: rename this as OpenAIVectorStore
-@json_schema_type
-class VectorStoreObject(BaseModel):
-    """OpenAI Vector Store object.
-
-    :param id: Unique identifier for the vector store
-    :param object: Object type identifier, always "vector_store"
-    :param created_at: Timestamp when the vector store was created
-    :param name: (Optional) Name of the vector store
-    :param usage_bytes: Storage space used by the vector store in bytes
-    :param file_counts: File processing status counts for the vector store
-    :param status: Current status of the vector store
-    :param expires_after: (Optional) Expiration policy for the vector store
-    :param expires_at: (Optional) Timestamp when the vector store will expire
-    :param last_active_at: (Optional) Timestamp of last activity on the vector store
-    :param metadata: Set of key-value pairs that can be attached to the vector store
-    """
-
-    id: str
-    object: str = "vector_store"
-    created_at: int
-    name: str | None = None
-    usage_bytes: int = 0
-    file_counts: VectorStoreFileCounts
-    status: str = "completed"
-    expires_after: dict[str, Any] | None = None
-    expires_at: int | None = None
-    last_active_at: int | None = None
-    metadata: dict[str, Any] = Field(default_factory=dict)
-
-
-@json_schema_type
-class VectorStoreCreateRequest(BaseModel):
-    """Request to create a vector store.
-
-    :param name: (Optional) Name for the vector store
-    :param file_ids: List of file IDs to include in the vector store
-    :param expires_after: (Optional) Expiration policy for the vector store
-    :param chunking_strategy: (Optional) Strategy for splitting files into chunks
-    :param metadata: Set of key-value pairs that can be attached to the vector store
-    """
-
-    name: str | None = None
-    file_ids: list[str] = Field(default_factory=list)
-    expires_after: dict[str, Any] | None = None
-    chunking_strategy: dict[str, Any] | None = None
-    metadata: dict[str, Any] = Field(default_factory=dict)
-
-
-@json_schema_type
-class VectorStoreModifyRequest(BaseModel):
-    """Request to modify a vector store.
-
-    :param name: (Optional) Updated name for the vector store
-    :param expires_after: (Optional) Updated expiration policy for the vector store
-    :param metadata: (Optional) Updated set of key-value pairs for the vector store
-    """
-
-    name: str | None = None
-    expires_after: dict[str, Any] | None = None
-    metadata: dict[str, Any] | None = None
-
-
-@json_schema_type
-class VectorStoreListResponse(BaseModel):
-    """Response from listing vector stores.
-
-    :param object: Object type identifier, always "list"
-    :param data: List of vector store objects
-    :param first_id: (Optional) ID of the first vector store in the list for pagination
-    :param last_id: (Optional) ID of the last vector store in the list for pagination
-    :param has_more: Whether there are more vector stores available beyond this page
-    """
-
-    object: str = "list"
-    data: list[VectorStoreObject]
-    first_id: str | None = None
-    last_id: str | None = None
-    has_more: bool = False
-
-
-@json_schema_type
-class VectorStoreSearchRequest(BaseModel):
-    """Request to search a vector store.
-
-    :param query: Search query as a string or list of strings
-    :param filters: (Optional) Filters based on file attributes to narrow search results
-    :param max_num_results: Maximum number of results to return, defaults to 10
-    :param ranking_options: (Optional) Options for ranking and filtering search results
-    :param rewrite_query: Whether to rewrite the query for better vector search performance
-    """
-
-    query: str | list[str]
-    filters: dict[str, Any] | None = None
-    max_num_results: int = 10
-    ranking_options: dict[str, Any] | None = None
-    rewrite_query: bool = False
-
-
-@json_schema_type
-class VectorStoreContent(BaseModel):
-    """Content item from a vector store file or search result.
-
-    :param type: Content type, currently only "text" is supported
-    :param text: The actual text content
-    :param embedding: Optional embedding vector for this content chunk
-    :param chunk_metadata: Optional chunk metadata
-    :param metadata: Optional user-defined metadata
-    """
-
-    type: Literal["text"]
-    text: str
-    embedding: list[float] | None = None
-    chunk_metadata: ChunkMetadata | None = None
-    metadata: dict[str, Any] | None = None
-
-
-@json_schema_type
-class VectorStoreSearchResponse(BaseModel):
-    """Response from searching a vector store.
-
-    :param file_id: Unique identifier of the file containing the result
-    :param filename: Name of the file containing the result
-    :param score: Relevance score for this search result
-    :param attributes: (Optional) Key-value attributes associated with the file
-    :param content: List of content items matching the search query
-    """
-
-    file_id: str
-    filename: str
-    score: float
-    attributes: dict[str, str | float | bool] | None = None
-    content: list[VectorStoreContent]
-
-
-@json_schema_type
-class VectorStoreSearchResponsePage(BaseModel):
-    """Paginated response from searching a vector store.
-
-    :param object: Object type identifier for the search results page
-    :param search_query: The original search query that was executed
-    :param data: List of search result objects
-    :param has_more: Whether there are more results available beyond this page
-    :param next_page: (Optional) Token for retrieving the next page of results
-    """
-
-    object: str = "vector_store.search_results.page"
-    search_query: list[str]
-    data: list[VectorStoreSearchResponse]
-    has_more: bool = False
-    next_page: str | None = None
-
-
-@json_schema_type
-class VectorStoreDeleteResponse(BaseModel):
-    """Response from deleting a vector store.
-
-    :param id: Unique identifier of the deleted vector store
-    :param object: Object type identifier for the deletion response
-    :param deleted: Whether the deletion operation was successful
-    """
-
-    id: str
-    object: str = "vector_store.deleted"
-    deleted: bool = True
-
-
-@json_schema_type
-class VectorStoreFileContentResponse(BaseModel):
-    """Represents the parsed content of a vector store file.
-
-    :param object: The object type, which is always `vector_store.file_content.page`
-    :param data: Parsed content of the file
-    :param has_more: Indicates if there are more content pages to fetch
-    :param next_page: The token for the next page, if any
-    """
-
-    object: Literal["vector_store.file_content.page"] = "vector_store.file_content.page"
-    data: list[VectorStoreContent]
-    has_more: bool = False
-    next_page: str | None = None
-
-
-@json_schema_type
-class VectorStoreChunkingStrategyAuto(BaseModel):
-    """Automatic chunking strategy for vector store files.
-
-    :param type: Strategy type, always "auto" for automatic chunking
-    """
-
-    type: Literal["auto"] = "auto"
-
-
-@json_schema_type
-class VectorStoreChunkingStrategyStaticConfig(BaseModel):
-    """Configuration for static chunking strategy.
-
-    :param chunk_overlap_tokens: Number of tokens to overlap between adjacent chunks
-    :param max_chunk_size_tokens: Maximum number of tokens per chunk, must be between 100 and 4096
-    """
-
-    chunk_overlap_tokens: int = 400
-    max_chunk_size_tokens: int = Field(800, ge=100, le=4096)
-
-
-@json_schema_type
-class VectorStoreChunkingStrategyStatic(BaseModel):
-    """Static chunking strategy with configurable parameters.
-
-    :param type: Strategy type, always "static" for static chunking
-    :param static: Configuration parameters for the static chunking strategy
-    """
-
-    type: Literal["static"] = "static"
-    static: VectorStoreChunkingStrategyStaticConfig
-
-
-VectorStoreChunkingStrategy = Annotated[
-    VectorStoreChunkingStrategyAuto | VectorStoreChunkingStrategyStatic,
-    Field(discriminator="type"),
-]
-register_schema(VectorStoreChunkingStrategy, name="VectorStoreChunkingStrategy")
-
-
-class SearchRankingOptions(BaseModel):
-    """Options for ranking and filtering search results.
-
-    :param ranker: (Optional) Name of the ranking algorithm to use
-    :param score_threshold: (Optional) Minimum relevance score threshold for results
-    """
-
-    ranker: str | None = None
-    # NOTE: OpenAI File Search Tool requires threshold to be between 0 and 1, however
-    # we don't guarantee that the score is between 0 and 1, so will leave this unconstrained
-    # and let the provider handle it
-    score_threshold: float | None = Field(default=0.0)
-
-
-@json_schema_type
-class VectorStoreFileLastError(BaseModel):
-    """Error information for failed vector store file processing.
-
-    :param code: Error code indicating the type of failure
-    :param message: Human-readable error message describing the failure
-    """
-
-    code: Literal["server_error"] | Literal["rate_limit_exceeded"]
-    message: str
-
-
-VectorStoreFileStatus = Literal["completed"] | Literal["in_progress"] | Literal["cancelled"] | Literal["failed"]
-register_schema(VectorStoreFileStatus, name="VectorStoreFileStatus")
-
-
-@json_schema_type
-class VectorStoreFileObject(BaseModel):
-    """OpenAI Vector Store File object.
-
-    :param id: Unique identifier for the file
-    :param object: Object type identifier, always "vector_store.file"
-    :param attributes: Key-value attributes associated with the file
-    :param chunking_strategy: Strategy used for splitting the file into chunks
-    :param created_at: Timestamp when the file was added to the vector store
-    :param last_error: (Optional) Error information if file processing failed
-    :param status: Current processing status of the file
-    :param usage_bytes: Storage space used by this file in bytes
-    :param vector_store_id: ID of the vector store containing this file
-    """
-
-    id: str
-    object: str = "vector_store.file"
-    attributes: dict[str, Any] = Field(default_factory=dict)
-    chunking_strategy: VectorStoreChunkingStrategy
-    created_at: int
-    last_error: VectorStoreFileLastError | None = None
-    status: VectorStoreFileStatus
-    usage_bytes: int = 0
-    vector_store_id: str
-
-
-@json_schema_type
-class VectorStoreListFilesResponse(BaseModel):
-    """Response from listing files in a vector store.
-
-    :param object: Object type identifier, always "list"
-    :param data: List of vector store file objects
-    :param first_id: (Optional) ID of the first file in the list for pagination
-    :param last_id: (Optional) ID of the last file in the list for pagination
-    :param has_more: Whether there are more files available beyond this page
-    """
-
-    object: str = "list"
-    data: list[VectorStoreFileObject]
-    first_id: str | None = None
-    last_id: str | None = None
-    has_more: bool = False
-
-
-@json_schema_type
-class VectorStoreFileDeleteResponse(BaseModel):
-    """Response from deleting a vector store file.
-
-    :param id: Unique identifier of the deleted file
-    :param object: Object type identifier for the deletion response
-    :param deleted: Whether the deletion operation was successful
-    """
-
-    id: str
-    object: str = "vector_store.file.deleted"
-    deleted: bool = True
-
-
-@json_schema_type
-class VectorStoreFileBatchObject(BaseModel):
-    """OpenAI Vector Store File Batch object.
-
-    :param id: Unique identifier for the file batch
-    :param object: Object type identifier, always "vector_store.file_batch"
-    :param created_at: Timestamp when the file batch was created
-    :param vector_store_id: ID of the vector store containing the file batch
-    :param status: Current processing status of the file batch
-    :param file_counts: File processing status counts for the batch
-    """
-
-    id: str
-    object: str = "vector_store.file_batch"
-    created_at: int
-    vector_store_id: str
-    status: VectorStoreFileStatus
-    file_counts: VectorStoreFileCounts
-
-
-@json_schema_type
-class VectorStoreFilesListInBatchResponse(BaseModel):
-    """Response from listing files in a vector store file batch.
-
-    :param object: Object type identifier, always "list"
-    :param data: List of vector store file objects in the batch
-    :param first_id: (Optional) ID of the first file in the list for pagination
-    :param last_id: (Optional) ID of the last file in the list for pagination
-    :param has_more: Whether there are more files available beyond this page
-    """
-
-    object: str = "list"
-    data: list[VectorStoreFileObject]
-    first_id: str | None = None
-    last_id: str | None = None
-    has_more: bool = False
-
-
-# extra_body can be accessed via .model_extra
-@json_schema_type
-class OpenAICreateVectorStoreRequestWithExtraBody(BaseModel, extra="allow"):
-    """Request to create a vector store with extra_body support.
-
-    :param name: (Optional) A name for the vector store
-    :param file_ids: List of file IDs to include in the vector store
-    :param expires_after: (Optional) Expiration policy for the vector store
-    :param chunking_strategy: (Optional) Strategy for splitting files into chunks
-    :param metadata: Set of key-value pairs that can be attached to the vector store
-    """
-
-    name: str | None = None
-    file_ids: list[str] | None = None
-    expires_after: dict[str, Any] | None = None
-    chunking_strategy: VectorStoreChunkingStrategy | None = None
-    metadata: dict[str, Any] | None = None
-
-
-# extra_body can be accessed via .model_extra
-@json_schema_type
-class OpenAICreateVectorStoreFileBatchRequestWithExtraBody(BaseModel, extra="allow"):
-    """Request to create a vector store file batch with extra_body support.
-
-    :param file_ids: A list of File IDs that the vector store should use
-    :param attributes: (Optional) Key-value attributes to store with the files
-    :param chunking_strategy: (Optional) The chunking strategy used to chunk the file(s). Defaults to auto
-    """
-
-    file_ids: list[str]
-    attributes: dict[str, Any] | None = None
-    chunking_strategy: VectorStoreChunkingStrategy | None = None
-
-
-class VectorStoreTable(Protocol):
-    def get_vector_store(self, vector_store_id: str) -> VectorStore | None: ...
-
-
-@runtime_checkable
-@telemetry_traceable
-class VectorIO(Protocol):
-    vector_store_table: VectorStoreTable | None = None
-
-    # this will just block now until chunks are inserted, but it should
-    # probably return a Job instance which can be polled for completion
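-    # TODO(review): this read "rename vector_store_id to vector_store_id once Stainless is working", which is a no-op; presumably the source name was an older parameter name - confirm or remove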
-    @webmethod(route="/vector-io/insert", method="POST", level=LLAMA_STACK_API_V1)
-    async def insert_chunks(
-        self,
-        vector_store_id: str,
-        chunks: list[Chunk],
-        ttl_seconds: int | None = None,
-    ) -> None:
-        """Insert chunks into a vector database.
-
-        :param vector_store_id: The identifier of the vector database to insert the chunks into.
-        :param chunks: The chunks to insert. Each `Chunk` should contain content which can be interleaved text, images, or other types.
-            `metadata`: `dict[str, Any]` and `embedding`: `List[float]` are optional.
-            If `metadata` is provided, you configure how Llama Stack formats the chunk during generation.
-            If `embedding` is not provided, it will be computed later.
-        :param ttl_seconds: The time to live of the chunks.
-        """
-        ...
-
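-    # TODO(review): this read "rename vector_store_id to vector_store_id once Stainless is working", which is a no-op; presumably the source name was an older parameter name - confirm or remove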
-    @webmethod(route="/vector-io/query", method="POST", level=LLAMA_STACK_API_V1)
-    async def query_chunks(
-        self,
-        vector_store_id: str,
-        query: InterleavedContent,
-        params: dict[str, Any] | None = None,
-    ) -> QueryChunksResponse:
-        """Query chunks from a vector database.
-
-        :param vector_store_id: The identifier of the vector database to query.
-        :param query: The query to search for.
-        :param params: The parameters of the query.
-        :returns: A QueryChunksResponse.
-        """
-        ...
-
-    # OpenAI Vector Stores API endpoints
-    @webmethod(route="/vector_stores", method="POST", level=LLAMA_STACK_API_V1)
-    async def openai_create_vector_store(
-        self,
-        params: Annotated[OpenAICreateVectorStoreRequestWithExtraBody, Body(...)],
-    ) -> VectorStoreObject:
-        """Creates a vector store.
-
-        Generate an OpenAI-compatible vector store with the given parameters.
-        :returns: A VectorStoreObject representing the created vector store.
-        """
-        ...
-
-    @webmethod(route="/vector_stores", method="GET", level=LLAMA_STACK_API_V1)
-    async def openai_list_vector_stores(
-        self,
-        limit: int | None = 20,
-        order: str | None = "desc",
-        after: str | None = None,
-        before: str | None = None,
-    ) -> VectorStoreListResponse:
-        """Returns a list of vector stores.
-
-        :param limit: A limit on the number of objects to be returned. Limit can range between 1 and 100, and the default is 20.
-        :param order: Sort order by the `created_at` timestamp of the objects. `asc` for ascending order and `desc` for descending order.
-        :param after: A cursor for use in pagination. `after` is an object ID that defines your place in the list.
-        :param before: A cursor for use in pagination. `before` is an object ID that defines your place in the list.
-        :returns: A VectorStoreListResponse containing the list of vector stores.
-        """
-        ...
-
-    @webmethod(route="/vector_stores/{vector_store_id}", method="GET", level=LLAMA_STACK_API_V1)
-    async def openai_retrieve_vector_store(
-        self,
-        vector_store_id: str,
-    ) -> VectorStoreObject:
-        """Retrieves a vector store.
-
-        :param vector_store_id: The ID of the vector store to retrieve.
-        :returns: A VectorStoreObject representing the vector store.
-        """
-        ...
-
-    @webmethod(
-        route="/vector_stores/{vector_store_id}",
-        method="POST",
-        level=LLAMA_STACK_API_V1,
-    )
-    async def openai_update_vector_store(
-        self,
-        vector_store_id: str,
-        name: str | None = None,
-        expires_after: dict[str, Any] | None = None,
-        metadata: dict[str, Any] | None = None,
-    ) -> VectorStoreObject:
-        """Updates a vector store.
-
-        :param vector_store_id: The ID of the vector store to update.
-        :param name: The name of the vector store.
-        :param expires_after: The expiration policy for a vector store.
-        :param metadata: Set of 16 key-value pairs that can be attached to an object.
-        :returns: A VectorStoreObject representing the updated vector store.
-        """
-        ...
-
-    @webmethod(
-        route="/vector_stores/{vector_store_id}",
-        method="DELETE",
-        level=LLAMA_STACK_API_V1,
-    )
-    async def openai_delete_vector_store(
-        self,
-        vector_store_id: str,
-    ) -> VectorStoreDeleteResponse:
-        """Delete a vector store.
-
-        :param vector_store_id: The ID of the vector store to delete.
-        :returns: A VectorStoreDeleteResponse indicating the deletion status.
-        """
-        ...
-
-    @webmethod(
-        route="/vector_stores/{vector_store_id}/search",
-        method="POST",
-        level=LLAMA_STACK_API_V1,
-    )
-    async def openai_search_vector_store(
-        self,
-        vector_store_id: str,
-        query: str | list[str],
-        filters: dict[str, Any] | None = None,
-        max_num_results: int | None = 10,
-        ranking_options: SearchRankingOptions | None = None,
-        rewrite_query: bool | None = False,
-        search_mode: (
-            str | None
-        ) = "vector",  # Using str instead of Literal due to OpenAPI schema generator limitations
-    ) -> VectorStoreSearchResponsePage:
-        """Search for chunks in a vector store.
-
-        Searches a vector store for relevant chunks based on a query and optional file attribute filters.
-
-        :param vector_store_id: The ID of the vector store to search.
-        :param query: The query string or array for performing the search.
-        :param filters: Filters based on file attributes to narrow the search results.
-        :param max_num_results: Maximum number of results to return (1 to 50 inclusive, default 10).
-        :param ranking_options: Ranking options for fine-tuning the search results.
-        :param rewrite_query: Whether to rewrite the natural language query for vector search (default false)
-        :param search_mode: The search mode to use - "keyword", "vector", or "hybrid" (default "vector")
-        :returns: A VectorStoreSearchResponsePage containing the search results.
-        """
-        ...
-
-    @webmethod(
-        route="/vector_stores/{vector_store_id}/files",
-        method="POST",
-        level=LLAMA_STACK_API_V1,
-    )
-    async def openai_attach_file_to_vector_store(
-        self,
-        vector_store_id: str,
-        file_id: str,
-        attributes: dict[str, Any] | None = None,
-        chunking_strategy: VectorStoreChunkingStrategy | None = None,
-    ) -> VectorStoreFileObject:
-        """Attach a file to a vector store.
-
-        :param vector_store_id: The ID of the vector store to attach the file to.
-        :param file_id: The ID of the file to attach to the vector store.
-        :param attributes: The key-value attributes stored with the file, which can be used for filtering.
-        :param chunking_strategy: The chunking strategy to use for the file.
-        :returns: A VectorStoreFileObject representing the attached file.
-        """
-        ...
-
-    @webmethod(
-        route="/vector_stores/{vector_store_id}/files",
-        method="GET",
-        level=LLAMA_STACK_API_V1,
-    )
-    async def openai_list_files_in_vector_store(
-        self,
-        vector_store_id: str,
-        limit: int | None = 20,
-        order: str | None = "desc",
-        after: str | None = None,
-        before: str | None = None,
-        filter: VectorStoreFileStatus | None = None,
-    ) -> VectorStoreListFilesResponse:
-        """List files in a vector store.
-
-        :param vector_store_id: The ID of the vector store to list files from.
-        :param limit: (Optional) A limit on the number of objects to be returned. Limit can range between 1 and 100, and the default is 20.
-        :param order: (Optional) Sort order by the `created_at` timestamp of the objects. `asc` for ascending order and `desc` for descending order.
-        :param after: (Optional) A cursor for use in pagination. `after` is an object ID that defines your place in the list.
-        :param before: (Optional) A cursor for use in pagination. `before` is an object ID that defines your place in the list.
-        :param filter: (Optional) Filter by file status to only return files with the specified status.
-        :returns: A VectorStoreListFilesResponse containing the list of files.
-        """
-        ...
-
-    @webmethod(
-        route="/vector_stores/{vector_store_id}/files/{file_id}",
-        method="GET",
-        level=LLAMA_STACK_API_V1,
-    )
-    async def openai_retrieve_vector_store_file(
-        self,
-        vector_store_id: str,
-        file_id: str,
-    ) -> VectorStoreFileObject:
-        """Retrieves a vector store file.
-
-        :param vector_store_id: The ID of the vector store containing the file to retrieve.
-        :param file_id: The ID of the file to retrieve.
-        :returns: A VectorStoreFileObject representing the file.
-        """
-        ...
-
-    @webmethod(
-        route="/vector_stores/{vector_store_id}/files/{file_id}/content",
-        method="GET",
-        level=LLAMA_STACK_API_V1,
-    )
-    async def openai_retrieve_vector_store_file_contents(
-        self,
-        vector_store_id: str,
-        file_id: str,
-        include_embeddings: Annotated[bool | None, Query(default=False)] = False,
-        include_metadata: Annotated[bool | None, Query(default=False)] = False,
-    ) -> VectorStoreFileContentResponse:
-        """Retrieves the contents of a vector store file.
-
-        :param vector_store_id: The ID of the vector store containing the file to retrieve.
-        :param file_id: The ID of the file to retrieve.
-        :param include_embeddings: Whether to include embedding vectors in the response.
-        :param include_metadata: Whether to include chunk metadata in the response.
-        :returns: File contents, optionally with embeddings and metadata based on query parameters.
-        """
-        ...
-
-    @webmethod(
-        route="/vector_stores/{vector_store_id}/files/{file_id}",
-        method="POST",
-        level=LLAMA_STACK_API_V1,
-    )
-    async def openai_update_vector_store_file(
-        self,
-        vector_store_id: str,
-        file_id: str,
-        attributes: dict[str, Any],
-    ) -> VectorStoreFileObject:
-        """Updates a vector store file.
-
-        :param vector_store_id: The ID of the vector store containing the file to update.
-        :param file_id: The ID of the file to update.
-        :param attributes: The updated key-value attributes to store with the file.
-        :returns: A VectorStoreFileObject representing the updated file.
-        """
-        ...
-
-    @webmethod(
-        route="/vector_stores/{vector_store_id}/files/{file_id}",
-        method="DELETE",
-        level=LLAMA_STACK_API_V1,
-    )
-    async def openai_delete_vector_store_file(
-        self,
-        vector_store_id: str,
-        file_id: str,
-    ) -> VectorStoreFileDeleteResponse:
-        """Delete a vector store file.
-
-        :param vector_store_id: The ID of the vector store containing the file to delete.
-        :param file_id: The ID of the file to delete.
-        :returns: A VectorStoreFileDeleteResponse indicating the deletion status.
-        """
-        ...
-
-    @webmethod(
-        route="/vector_stores/{vector_store_id}/file_batches",
-        method="POST",
-        level=LLAMA_STACK_API_V1,
-    )
-    async def openai_create_vector_store_file_batch(
-        self,
-        vector_store_id: str,
-        params: Annotated[OpenAICreateVectorStoreFileBatchRequestWithExtraBody, Body(...)],
-    ) -> VectorStoreFileBatchObject:
-        """Create a vector store file batch.
-
-        Generate an OpenAI-compatible vector store file batch for the given vector store.
-        :param vector_store_id: The ID of the vector store to create the file batch for.
-        :returns: A VectorStoreFileBatchObject representing the created file batch.
-        """
-        ...
-
-    @webmethod(
-        route="/vector_stores/{vector_store_id}/file_batches/{batch_id}",
-        method="GET",
-        level=LLAMA_STACK_API_V1,
-    )
-    async def openai_retrieve_vector_store_file_batch(
-        self,
-        batch_id: str,
-        vector_store_id: str,
-    ) -> VectorStoreFileBatchObject:
-        """Retrieve a vector store file batch.
-
-        :param batch_id: The ID of the file batch to retrieve.
-        :param vector_store_id: The ID of the vector store containing the file batch.
-        :returns: A VectorStoreFileBatchObject representing the file batch.
-        """
-        ...
-
-    @webmethod(
-        route="/vector_stores/{vector_store_id}/file_batches/{batch_id}/files",
-        method="GET",
-        level=LLAMA_STACK_API_V1,
-    )
-    async def openai_list_files_in_vector_store_file_batch(
-        self,
-        batch_id: str,
-        vector_store_id: str,
-        after: str | None = None,
-        before: str | None = None,
-        filter: str | None = None,
-        limit: int | None = 20,
-        order: str | None = "desc",
-    ) -> VectorStoreFilesListInBatchResponse:
-        """Returns a list of vector store files in a batch.
-
-        :param batch_id: The ID of the file batch to list files from.
-        :param vector_store_id: The ID of the vector store containing the file batch.
-        :param after: A cursor for use in pagination. `after` is an object ID that defines your place in the list.
-        :param before: A cursor for use in pagination. `before` is an object ID that defines your place in the list.
-        :param filter: Filter by file status. One of in_progress, completed, failed, cancelled.
-        :param limit: A limit on the number of objects to be returned. Limit can range between 1 and 100, and the default is 20.
-        :param order: Sort order by the `created_at` timestamp of the objects. `asc` for ascending order and `desc` for descending order.
-        :returns: A VectorStoreFilesListInBatchResponse containing the list of files in the batch.
-        """
-        ...
-
-    @webmethod(
-        route="/vector_stores/{vector_store_id}/file_batches/{batch_id}/cancel",
-        method="POST",
-        level=LLAMA_STACK_API_V1,
-    )
-    async def openai_cancel_vector_store_file_batch(
-        self,
-        batch_id: str,
-        vector_store_id: str,
-    ) -> VectorStoreFileBatchObject:
-        """Cancels a vector store file batch.
-
-        :param batch_id: The ID of the file batch to cancel.
-        :param vector_store_id: The ID of the vector store containing the file batch.
-        :returns: A VectorStoreFileBatchObject representing the cancelled file batch.
-        """
-        ...
--- a/src/llama_stack/apis/vector_stores/__init__.py
+++ b/src/llama_stack/apis/vector_stores/__init__.py
@ -1,7 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from .vector_stores import *
--- a/src/llama_stack/apis/vector_stores/vector_stores.py
+++ b/src/llama_stack/apis/vector_stores/vector_stores.py
@ -1,51 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from typing import Literal
-
-from pydantic import BaseModel
-
-from llama_stack.apis.resource import Resource, ResourceType
-
-
-# Internal resource type for storing the vector store routing and other information
-class VectorStore(Resource):
-    """Vector database resource for storing and querying vector embeddings.
-
-    :param type: Type of resource, always 'vector_store' for vector stores
-    :param embedding_model: Name of the embedding model to use for vector generation
-    :param embedding_dimension: Dimension of the embedding vectors
-    """
-
-    type: Literal[ResourceType.vector_store] = ResourceType.vector_store
-
-    embedding_model: str
-    embedding_dimension: int
-    vector_store_name: str | None = None
-
-    @property
-    def vector_store_id(self) -> str:
-        return self.identifier
-
-    @property
-    def provider_vector_store_id(self) -> str | None:
-        return self.provider_resource_id
-
-
-class VectorStoreInput(BaseModel):
-    """Input parameters for creating or configuring a vector database.
-
-    :param vector_store_id: Unique identifier for the vector store
-    :param embedding_model: Name of the embedding model to use for vector generation
-    :param embedding_dimension: Dimension of the embedding vectors
-    :param provider_vector_store_id: (Optional) Provider-specific identifier for the vector store
-    """
-
-    vector_store_id: str
-    embedding_model: str
-    embedding_dimension: int
-    provider_id: str | None = None
-    provider_vector_store_id: str | None = None
--- a/src/llama_stack/apis/version.py
+++ b/src/llama_stack/apis/version.py
@ -1,9 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-LLAMA_STACK_API_V1 = "v1"
-LLAMA_STACK_API_V1BETA = "v1beta"
-LLAMA_STACK_API_V1ALPHA = "v1alpha"
--- a/src/llama_stack/cli/stack/_list_deps.py
+++ b/src/llama_stack/cli/stack/_list_deps.py
@ -9,6 +9,7 @@ import sys
 from pathlib import Path

 import yaml
+from llama_stack_api import Api
 from termcolor import cprint

 from llama_stack.cli.stack.utils import ImageType
@ -21,7 +22,6 @@ from llama_stack.core.datatypes import (
 from llama_stack.core.distribution import get_provider_registry
 from llama_stack.core.stack import replace_env_vars
 from llama_stack.log import get_logger
-from llama_stack.providers.datatypes import Api

 TEMPLATES_PATH = Path(__file__).parent.parent.parent / "templates"

--- a/src/llama_stack/cli/stack/utils.py
+++ b/src/llama_stack/cli/stack/utils.py
@ -11,6 +11,7 @@ from functools import lru_cache
 from pathlib import Path

 import yaml
+from llama_stack_api import Api
 from termcolor import cprint

 from llama_stack.core.datatypes import (
@ -32,7 +33,6 @@ from llama_stack.core.storage.datatypes import (
 from llama_stack.core.utils.config_dirs import DISTRIBS_BASE_DIR, EXTERNAL_PROVIDERS_DIR
 from llama_stack.core.utils.dynamic import instantiate_class_type
 from llama_stack.core.utils.image_types import LlamaStackImageType
-from llama_stack.providers.datatypes import Api

 TEMPLATES_PATH = Path(__file__).parent.parent.parent / "distributions"

--- a/src/llama_stack/core/build.py
+++ b/src/llama_stack/core/build.py
@ -6,6 +6,7 @@

 import sys

+from llama_stack_api import Api
 from pydantic import BaseModel
 from termcolor import cprint

@ -13,7 +14,6 @@ from llama_stack.core.datatypes import BuildConfig
 from llama_stack.core.distribution import get_provider_registry
 from llama_stack.distributions.template import DistributionTemplate
 from llama_stack.log import get_logger
-from llama_stack.providers.datatypes import Api

 log = get_logger(name=__name__, category="core")

--- a/src/llama_stack/core/client.py
+++ b/src/llama_stack/core/client.py
@ -12,11 +12,10 @@ from enum import Enum
 from typing import Any, Union, get_args, get_origin

 import httpx
+from llama_stack_api import RemoteProviderConfig
 from pydantic import BaseModel, parse_obj_as
 from termcolor import cprint

-from llama_stack.providers.datatypes import RemoteProviderConfig
-
 _CLIENT_CLASSES = {}


--- a/src/llama_stack/core/configure.py
+++ b/src/llama_stack/core/configure.py
@ -6,6 +6,8 @@
 import textwrap
 from typing import Any

+from llama_stack_api import Api, ProviderSpec
+
 from llama_stack.core.datatypes import (
    LLAMA_STACK_RUN_CONFIG_VERSION,
    DistributionSpec,
@ -20,7 +22,6 @@ from llama_stack.core.stack import cast_image_name_to_string, replace_env_vars
 from llama_stack.core.utils.dynamic import instantiate_class_type
 from llama_stack.core.utils.prompt_for_config import prompt_for_config
 from llama_stack.log import get_logger
-from llama_stack.providers.datatypes import Api, ProviderSpec

 logger = get_logger(name=__name__, category="core")

--- a/src/llama_stack/core/conversations/conversations.py
+++ b/src/llama_stack/core/conversations/conversations.py
@ -8,9 +8,7 @@ import secrets
 import time
 from typing import Any, Literal

-from pydantic import BaseModel, TypeAdapter
-
-from llama_stack.apis.conversations.conversations import (
+from llama_stack_api import (
    Conversation,
    ConversationDeletedResource,
    ConversationItem,
@ -20,6 +18,8 @@ from llama_stack.apis.conversations.conversations import (
    Conversations,
    Metadata,
 )
+from pydantic import BaseModel, TypeAdapter
+
 from llama_stack.core.datatypes import AccessRule, StackRunConfig
 from llama_stack.log import get_logger
 from llama_stack.providers.utils.sqlstore.api import ColumnDefinition, ColumnType
--- a/src/llama_stack/core/datatypes.py
+++ b/src/llama_stack/core/datatypes.py
@ -9,22 +9,34 @@ from pathlib import Path
 from typing import Annotated, Any, Literal, Self
 from urllib.parse import urlparse

+from llama_stack_api import (
+    Api,
+    Benchmark,
+    BenchmarkInput,
+    Dataset,
+    DatasetInput,
+    DatasetIO,
+    Eval,
+    Inference,
+    Model,
+    ModelInput,
+    ProviderSpec,
+    Resource,
+    Safety,
+    Scoring,
+    ScoringFn,
+    ScoringFnInput,
+    Shield,
+    ShieldInput,
+    ToolGroup,
+    ToolGroupInput,
+    ToolRuntime,
+    VectorIO,
+    VectorStore,
+    VectorStoreInput,
+)
 from pydantic import BaseModel, Field, field_validator, model_validator

-from llama_stack.apis.benchmarks import Benchmark, BenchmarkInput
-from llama_stack.apis.datasetio import DatasetIO
-from llama_stack.apis.datasets import Dataset, DatasetInput
-from llama_stack.apis.eval import Eval
-from llama_stack.apis.inference import Inference
-from llama_stack.apis.models import Model, ModelInput
-from llama_stack.apis.resource import Resource
-from llama_stack.apis.safety import Safety
-from llama_stack.apis.scoring import Scoring
-from llama_stack.apis.scoring_functions import ScoringFn, ScoringFnInput
-from llama_stack.apis.shields import Shield, ShieldInput
-from llama_stack.apis.tools import ToolGroup, ToolGroupInput, ToolRuntime
-from llama_stack.apis.vector_io import VectorIO
-from llama_stack.apis.vector_stores import VectorStore, VectorStoreInput
 from llama_stack.core.access_control.datatypes import AccessRule
 from llama_stack.core.storage.datatypes import (
    KVStoreReference,
@ -32,7 +44,6 @@ from llama_stack.core.storage.datatypes import (
    StorageConfig,
 )
 from llama_stack.log import LoggingConfig
-from llama_stack.providers.datatypes import Api, ProviderSpec

 LLAMA_STACK_BUILD_CONFIG_VERSION = 2
 LLAMA_STACK_RUN_CONFIG_VERSION = 2
--- a/src/llama_stack/core/distribution.py
+++ b/src/llama_stack/core/distribution.py
@ -10,17 +10,17 @@ import os
 from typing import Any

 import yaml
-from pydantic import BaseModel
-
-from llama_stack.core.datatypes import BuildConfig, DistributionSpec
-from llama_stack.core.external import load_external_apis
-from llama_stack.log import get_logger
-from llama_stack.providers.datatypes import (
+from llama_stack_api import (
    Api,
    InlineProviderSpec,
    ProviderSpec,
    RemoteProviderSpec,
 )
+from pydantic import BaseModel
+
+from llama_stack.core.datatypes import BuildConfig, DistributionSpec
+from llama_stack.core.external import load_external_apis
+from llama_stack.log import get_logger

 logger = get_logger(name=__name__, category="core")

--- a/src/llama_stack/core/external.py
+++ b/src/llama_stack/core/external.py
@ -6,8 +6,8 @@


 import yaml
+from llama_stack_api import Api, ExternalApiSpec

-from llama_stack.apis.datatypes import Api, ExternalApiSpec
 from llama_stack.core.datatypes import BuildConfig, StackRunConfig
 from llama_stack.log import get_logger

--- a/src/llama_stack/core/inspect.py
+++ b/src/llama_stack/core/inspect.py
@ -6,19 +6,19 @@

 from importlib.metadata import version

-from pydantic import BaseModel
-
-from llama_stack.apis.inspect import (
+from llama_stack_api import (
    HealthInfo,
+    HealthStatus,
    Inspect,
    ListRoutesResponse,
    RouteInfo,
    VersionInfo,
 )
+from pydantic import BaseModel
+
 from llama_stack.core.datatypes import StackRunConfig
 from llama_stack.core.external import load_external_apis
 from llama_stack.core.server.routes import get_all_api_routes
-from llama_stack.providers.datatypes import HealthStatus


 class DistributionInspectConfig(BaseModel):
--- a/src/llama_stack/core/library_client.py
+++ b/src/llama_stack/core/library_client.py
@ -18,6 +18,7 @@ from typing import Any, TypeVar, Union, get_args, get_origin
 import httpx
 import yaml
 from fastapi import Response as FastAPIResponse
+from llama_stack_api import is_unwrapped_body_param

 try:
    from llama_stack_client import (
@ -57,7 +58,6 @@ from llama_stack.core.utils.config import redact_sensitive_fields
 from llama_stack.core.utils.context import preserve_contexts_async_generator
 from llama_stack.core.utils.exec import in_notebook
 from llama_stack.log import get_logger, setup_logging
-from llama_stack.strong_typing.inspection import is_unwrapped_body_param

 logger = get_logger(name=__name__, category="core")

--- a/src/llama_stack/core/prompts/prompts.py
+++ b/src/llama_stack/core/prompts/prompts.py
@ -7,9 +7,9 @@
 import json
 from typing import Any

+from llama_stack_api import ListPromptsResponse, Prompt, Prompts
 from pydantic import BaseModel

-from llama_stack.apis.prompts import ListPromptsResponse, Prompt, Prompts
 from llama_stack.core.datatypes import StackRunConfig
 from llama_stack.providers.utils.kvstore import KVStore, kvstore_impl

--- a/src/llama_stack/core/providers.py
+++ b/src/llama_stack/core/providers.py
@ -7,11 +7,10 @@
 import asyncio
 from typing import Any

+from llama_stack_api import HealthResponse, HealthStatus, ListProvidersResponse, ProviderInfo, Providers
 from pydantic import BaseModel

-from llama_stack.apis.providers import ListProvidersResponse, ProviderInfo, Providers
 from llama_stack.log import get_logger
-from llama_stack.providers.datatypes import HealthResponse, HealthStatus

 from .datatypes import StackRunConfig
 from .utils.config import redact_sensitive_fields
--- a/src/llama_stack/core/resolver.py
+++ b/src/llama_stack/core/resolver.py
@ -8,29 +8,46 @@ import importlib.metadata
 import inspect
 from typing import Any

-from llama_stack.apis.agents import Agents
-from llama_stack.apis.batches import Batches
-from llama_stack.apis.benchmarks import Benchmarks
-from llama_stack.apis.conversations import Conversations
-from llama_stack.apis.datasetio import DatasetIO
-from llama_stack.apis.datasets import Datasets
-from llama_stack.apis.datatypes import ExternalApiSpec
-from llama_stack.apis.eval import Eval
-from llama_stack.apis.files import Files
-from llama_stack.apis.inference import Inference, InferenceProvider
-from llama_stack.apis.inspect import Inspect
-from llama_stack.apis.models import Models
-from llama_stack.apis.post_training import PostTraining
-from llama_stack.apis.prompts import Prompts
-from llama_stack.apis.providers import Providers as ProvidersAPI
-from llama_stack.apis.safety import Safety
-from llama_stack.apis.scoring import Scoring
-from llama_stack.apis.scoring_functions import ScoringFunctions
-from llama_stack.apis.shields import Shields
-from llama_stack.apis.tools import ToolGroups, ToolRuntime
-from llama_stack.apis.vector_io import VectorIO
-from llama_stack.apis.vector_stores import VectorStore
-from llama_stack.apis.version import LLAMA_STACK_API_V1ALPHA
+from llama_stack_api import (
+    LLAMA_STACK_API_V1ALPHA,
+    Agents,
+    Api,
+    Batches,
+    Benchmarks,
+    BenchmarksProtocolPrivate,
+    Conversations,
+    DatasetIO,
+    Datasets,
+    DatasetsProtocolPrivate,
+    Eval,
+    ExternalApiSpec,
+    Files,
+    Inference,
+    InferenceProvider,
+    Inspect,
+    Models,
+    ModelsProtocolPrivate,
+    PostTraining,
+    Prompts,
+    ProviderSpec,
+    RemoteProviderConfig,
+    RemoteProviderSpec,
+    Safety,
+    Scoring,
+    ScoringFunctions,
+    ScoringFunctionsProtocolPrivate,
+    Shields,
+    ShieldsProtocolPrivate,
+    ToolGroups,
+    ToolGroupsProtocolPrivate,
+    ToolRuntime,
+    VectorIO,
+    VectorStore,
+)
+from llama_stack_api import (
+    Providers as ProvidersAPI,
+)
+
 from llama_stack.core.client import get_client_impl
 from llama_stack.core.datatypes import (
    AccessRule,
@ -44,18 +61,6 @@ from llama_stack.core.external import load_external_apis
 from llama_stack.core.store import DistributionRegistry
 from llama_stack.core.utils.dynamic import instantiate_class_type
 from llama_stack.log import get_logger
-from llama_stack.providers.datatypes import (
-    Api,
-    BenchmarksProtocolPrivate,
-    DatasetsProtocolPrivate,
-    ModelsProtocolPrivate,
-    ProviderSpec,
-    RemoteProviderConfig,
-    RemoteProviderSpec,
-    ScoringFunctionsProtocolPrivate,
-    ShieldsProtocolPrivate,
-    ToolGroupsProtocolPrivate,
-)

 logger = get_logger(name=__name__, category="core")

--- a/src/llama_stack/core/routers/init.py
+++ b/src/llama_stack/core/routers/init.py
@ -6,13 +6,14 @@

 from typing import Any

+from llama_stack_api import Api, RoutingTable
+
 from llama_stack.core.datatypes import (
    AccessRule,
    RoutedProtocol,
 )
 from llama_stack.core.stack import StackRunConfig
 from llama_stack.core.store import DistributionRegistry
-from llama_stack.providers.datatypes import Api, RoutingTable
 from llama_stack.providers.utils.inference.inference_store import InferenceStore


--- a/src/llama_stack/core/routers/datasets.py
+++ b/src/llama_stack/core/routers/datasets.py
@ -6,11 +6,9 @@

 from typing import Any

-from llama_stack.apis.common.responses import PaginatedResponse
-from llama_stack.apis.datasetio import DatasetIO
-from llama_stack.apis.datasets import DatasetPurpose, DataSource
+from llama_stack_api import DatasetIO, DatasetPurpose, DataSource, PaginatedResponse, RoutingTable
+
 from llama_stack.log import get_logger
-from llama_stack.providers.datatypes import RoutingTable

 logger = get_logger(name=__name__, category="core::routers")

--- a/src/llama_stack/core/routers/eval_scoring.py
+++ b/src/llama_stack/core/routers/eval_scoring.py
@ -6,15 +6,19 @@

 from typing import Any

-from llama_stack.apis.eval import BenchmarkConfig, Eval, EvaluateResponse, Job
-from llama_stack.apis.scoring import (
+from llama_stack_api import (
+    BenchmarkConfig,
+    Eval,
+    EvaluateResponse,
+    Job,
+    RoutingTable,
    ScoreBatchResponse,
    ScoreResponse,
    Scoring,
    ScoringFnParams,
 )
+
 from llama_stack.log import get_logger
-from llama_stack.providers.datatypes import RoutingTable

 logger = get_logger(name=__name__, category="core::routers")

--- a/src/llama_stack/core/routers/inference.py
+++ b/src/llama_stack/core/routers/inference.py
@ -11,17 +11,19 @@ from datetime import UTC, datetime
 from typing import Annotated, Any

 from fastapi import Body
-from openai.types.chat import ChatCompletionToolChoiceOptionParam as OpenAIChatCompletionToolChoiceOptionParam
-from openai.types.chat import ChatCompletionToolParam as OpenAIChatCompletionToolParam
-from pydantic import TypeAdapter
-
-from llama_stack.apis.common.errors import ModelNotFoundError, ModelTypeError
-from llama_stack.apis.inference import (
+from llama_stack_api import (
+    HealthResponse,
+    HealthStatus,
    Inference,
    ListOpenAIChatCompletionResponse,
+    ModelNotFoundError,
+    ModelType,
+    ModelTypeError,
    OpenAIAssistantMessageParam,
    OpenAIChatCompletion,
    OpenAIChatCompletionChunk,
+    OpenAIChatCompletionContentPartImageParam,
+    OpenAIChatCompletionContentPartTextParam,
    OpenAIChatCompletionRequestWithExtraBody,
    OpenAIChatCompletionToolCall,
    OpenAIChatCompletionToolCallFunction,
@ -35,18 +37,17 @@ from llama_stack.apis.inference import (
    OpenAIMessageParam,
    Order,
    RerankResponse,
+    RoutingTable,
 )
-from llama_stack.apis.inference.inference import (
-    OpenAIChatCompletionContentPartImageParam,
-    OpenAIChatCompletionContentPartTextParam,
-)
-from llama_stack.apis.models import ModelType
+from openai.types.chat import ChatCompletionToolChoiceOptionParam as OpenAIChatCompletionToolChoiceOptionParam
+from openai.types.chat import ChatCompletionToolParam as OpenAIChatCompletionToolParam
+from pydantic import TypeAdapter
+
 from llama_stack.core.telemetry.telemetry import MetricEvent
 from llama_stack.core.telemetry.tracing import enqueue_event, get_current_span
 from llama_stack.log import get_logger
 from llama_stack.models.llama.llama3.chat_format import ChatFormat
 from llama_stack.models.llama.llama3.tokenizer import Tokenizer
-from llama_stack.providers.datatypes import HealthResponse, HealthStatus, RoutingTable
 from llama_stack.providers.utils.inference.inference_store import InferenceStore

 logger = get_logger(name=__name__, category="core::routers")
--- a/src/llama_stack/core/routers/safety.py
+++ b/src/llama_stack/core/routers/safety.py
@ -6,13 +6,10 @@

 from typing import Any

-from llama_stack.apis.inference import OpenAIMessageParam
-from llama_stack.apis.safety import RunShieldResponse, Safety
-from llama_stack.apis.safety.safety import ModerationObject
-from llama_stack.apis.shields import Shield
+from llama_stack_api import ModerationObject, OpenAIMessageParam, RoutingTable, RunShieldResponse, Safety, Shield
+
 from llama_stack.core.datatypes import SafetyConfig
 from llama_stack.log import get_logger
-from llama_stack.providers.datatypes import RoutingTable

 logger = get_logger(name=__name__, category="core::routers")

--- a/src/llama_stack/core/routers/tool_runtime.py
+++ b/src/llama_stack/core/routers/tool_runtime.py
@ -6,13 +6,12 @@

 from typing import Any

-from llama_stack.apis.common.content_types import (
+from llama_stack_api import (
    URL,
-)
-from llama_stack.apis.tools import (
    ListToolDefsResponse,
    ToolRuntime,
 )
+
 from llama_stack.log import get_logger

 from ..routing_tables.toolgroups import ToolGroupsRoutingTable
--- a/src/llama_stack/core/routers/vector_io.py
+++ b/src/llama_stack/core/routers/vector_io.py
@ -9,14 +9,16 @@ import uuid
 from typing import Annotated, Any

 from fastapi import Body
-
-from llama_stack.apis.common.content_types import InterleavedContent
-from llama_stack.apis.models import ModelType
-from llama_stack.apis.vector_io import (
+from llama_stack_api import (
    Chunk,
+    HealthResponse,
+    HealthStatus,
+    InterleavedContent,
+    ModelType,
    OpenAICreateVectorStoreFileBatchRequestWithExtraBody,
    OpenAICreateVectorStoreRequestWithExtraBody,
    QueryChunksResponse,
+    RoutingTable,
    SearchRankingOptions,
    VectorIO,
    VectorStoreChunkingStrategy,
@ -33,9 +35,9 @@ from llama_stack.apis.vector_io import (
    VectorStoreObject,
    VectorStoreSearchResponsePage,
 )
+
 from llama_stack.core.datatypes import VectorStoresConfig
 from llama_stack.log import get_logger
-from llama_stack.providers.datatypes import HealthResponse, HealthStatus, RoutingTable

 logger = get_logger(name=__name__, category="core::routers")

--- a/src/llama_stack/core/routing_tables/benchmarks.py
+++ b/src/llama_stack/core/routing_tables/benchmarks.py
@ -6,7 +6,8 @@

 from typing import Any

-from llama_stack.apis.benchmarks import Benchmark, Benchmarks, ListBenchmarksResponse
+from llama_stack_api import Benchmark, Benchmarks, ListBenchmarksResponse
+
 from llama_stack.core.datatypes import (
    BenchmarkWithOwner,
 )
--- a/src/llama_stack/core/routing_tables/common.py
+++ b/src/llama_stack/core/routing_tables/common.py
@ -6,9 +6,8 @@

 from typing import Any

-from llama_stack.apis.common.errors import ModelNotFoundError
-from llama_stack.apis.models import Model
-from llama_stack.apis.resource import ResourceType
+from llama_stack_api import Api, Model, ModelNotFoundError, ResourceType, RoutingTable
+
 from llama_stack.core.access_control.access_control import AccessDeniedError, is_action_allowed
 from llama_stack.core.access_control.datatypes import Action
 from llama_stack.core.datatypes import (
@ -21,7 +20,6 @@ from llama_stack.core.datatypes import (
 from llama_stack.core.request_headers import get_authenticated_user
 from llama_stack.core.store import DistributionRegistry
 from llama_stack.log import get_logger
-from llama_stack.providers.datatypes import Api, RoutingTable

 logger = get_logger(name=__name__, category="core::routing_tables")

--- a/src/llama_stack/core/routing_tables/datasets.py
+++ b/src/llama_stack/core/routing_tables/datasets.py
@ -7,18 +7,19 @@
 import uuid
 from typing import Any

-from llama_stack.apis.common.errors import DatasetNotFoundError
-from llama_stack.apis.datasets import (
+from llama_stack_api import (
    Dataset,
+    DatasetNotFoundError,
    DatasetPurpose,
    Datasets,
    DatasetType,
    DataSource,
    ListDatasetsResponse,
+    ResourceType,
    RowsDataSource,
    URIDataSource,
 )
-from llama_stack.apis.resource import ResourceType
+
 from llama_stack.core.datatypes import (
    DatasetWithOwner,
 )
--- a/src/llama_stack/core/routing_tables/models.py
+++ b/src/llama_stack/core/routing_tables/models.py
@ -7,8 +7,16 @@
 import time
 from typing import Any

-from llama_stack.apis.common.errors import ModelNotFoundError
-from llama_stack.apis.models import ListModelsResponse, Model, Models, ModelType, OpenAIListModelsResponse, OpenAIModel
+from llama_stack_api import (
+    ListModelsResponse,
+    Model,
+    ModelNotFoundError,
+    Models,
+    ModelType,
+    OpenAIListModelsResponse,
+    OpenAIModel,
+)
+
 from llama_stack.core.datatypes import (
    ModelWithOwner,
    RegistryEntrySource,
--- a/src/llama_stack/core/routing_tables/scoring_functions.py
+++ b/src/llama_stack/core/routing_tables/scoring_functions.py
@ -4,14 +4,15 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.

-from llama_stack.apis.common.type_system import ParamType
-from llama_stack.apis.resource import ResourceType
-from llama_stack.apis.scoring_functions import (
+from llama_stack_api import (
    ListScoringFunctionsResponse,
+    ParamType,
+    ResourceType,
    ScoringFn,
    ScoringFnParams,
    ScoringFunctions,
 )
+
 from llama_stack.core.datatypes import (
    ScoringFnWithOwner,
 )
--- a/src/llama_stack/core/routing_tables/shields.py
+++ b/src/llama_stack/core/routing_tables/shields.py
@ -6,8 +6,8 @@

 from typing import Any

-from llama_stack.apis.resource import ResourceType
-from llama_stack.apis.shields import ListShieldsResponse, Shield, Shields
+from llama_stack_api import ListShieldsResponse, ResourceType, Shield, Shields
+
 from llama_stack.core.datatypes import (
    ShieldWithOwner,
 )
--- a/src/llama_stack/core/routing_tables/toolgroups.py
+++ b/src/llama_stack/core/routing_tables/toolgroups.py
@ -6,9 +6,16 @@

 from typing import Any

-from llama_stack.apis.common.content_types import URL
-from llama_stack.apis.common.errors import ToolGroupNotFoundError
-from llama_stack.apis.tools import ListToolDefsResponse, ListToolGroupsResponse, ToolDef, ToolGroup, ToolGroups
+from llama_stack_api import (
+    URL,
+    ListToolDefsResponse,
+    ListToolGroupsResponse,
+    ToolDef,
+    ToolGroup,
+    ToolGroupNotFoundError,
+    ToolGroups,
+)
+
 from llama_stack.core.datatypes import AuthenticationRequiredError, ToolGroupWithOwner
 from llama_stack.log import get_logger

--- a/src/llama_stack/core/routing_tables/vector_stores.py
+++ b/src/llama_stack/core/routing_tables/vector_stores.py
@ -6,12 +6,12 @@

 from typing import Any

-from llama_stack.apis.common.errors import ModelNotFoundError, ModelTypeError
-from llama_stack.apis.models import ModelType
-from llama_stack.apis.resource import ResourceType
-
 # Removed VectorStores import to avoid exposing public API
-from llama_stack.apis.vector_io.vector_io import (
+from llama_stack_api import (
+    ModelNotFoundError,
+    ModelType,
+    ModelTypeError,
+    ResourceType,
    SearchRankingOptions,
    VectorStoreChunkingStrategy,
    VectorStoreDeleteResponse,
@ -22,6 +22,7 @@ from llama_stack.apis.vector_io.vector_io import (
    VectorStoreObject,
    VectorStoreSearchResponsePage,
 )
+
 from llama_stack.core.datatypes import (
    VectorStoreWithOwner,
 )
--- a/src/llama_stack/core/server/auth_providers.py
+++ b/src/llama_stack/core/server/auth_providers.py
@ -11,9 +11,9 @@ from urllib.parse import parse_qs, urljoin, urlparse

 import httpx
 import jwt
+from llama_stack_api import TokenValidationError
 from pydantic import BaseModel, Field

-from llama_stack.apis.common.errors import TokenValidationError
 from llama_stack.core.datatypes import (
    AuthenticationConfig,
    CustomAuthConfig,
--- a/src/llama_stack/core/server/routes.py
+++ b/src/llama_stack/core/server/routes.py
@ -10,11 +10,10 @@ from collections.abc import Callable
 from typing import Any

 from aiohttp import hdrs
+from llama_stack_api import Api, ExternalApiSpec, WebMethod
 from starlette.routing import Route

-from llama_stack.apis.datatypes import Api, ExternalApiSpec
 from llama_stack.core.resolver import api_protocol_map
-from llama_stack.schema_utils import WebMethod

 EndpointFunc = Callable[..., Any]
 PathParams = dict[str, str]
--- a/src/llama_stack/core/server/server.py
+++ b/src/llama_stack/core/server/server.py
@ -28,11 +28,10 @@ from fastapi import Path as FastapiPath
 from fastapi.exceptions import RequestValidationError
 from fastapi.middleware.cors import CORSMiddleware
 from fastapi.responses import JSONResponse, StreamingResponse
+from llama_stack_api import Api, ConflictError, PaginatedResponse, ResourceNotFoundError
 from openai import BadRequestError
 from pydantic import BaseModel, ValidationError

-from llama_stack.apis.common.errors import ConflictError, ResourceNotFoundError
-from llama_stack.apis.common.responses import PaginatedResponse
 from llama_stack.core.access_control.access_control import AccessDeniedError
 from llama_stack.core.datatypes import (
    AuthenticationRequiredError,
@ -58,7 +57,6 @@ from llama_stack.core.utils.config import redact_sensitive_fields
 from llama_stack.core.utils.config_resolution import Mode, resolve_config_or_distro
 from llama_stack.core.utils.context import preserve_contexts_async_generator
 from llama_stack.log import LoggingConfig, get_logger, setup_logging
-from llama_stack.providers.datatypes import Api

 from .auth import AuthenticationMiddleware
 from .quota import QuotaMiddleware
--- a/src/llama_stack/core/stack.py
+++ b/src/llama_stack/core/stack.py
@ -12,27 +12,31 @@ import tempfile
 from typing import Any

 import yaml
+from llama_stack_api import (
+    Agents,
+    Api,
+    Batches,
+    Benchmarks,
+    Conversations,
+    DatasetIO,
+    Datasets,
+    Eval,
+    Files,
+    Inference,
+    Inspect,
+    Models,
+    PostTraining,
+    Prompts,
+    Providers,
+    Safety,
+    Scoring,
+    ScoringFunctions,
+    Shields,
+    ToolGroups,
+    ToolRuntime,
+    VectorIO,
+)

-from llama_stack.apis.agents import Agents
-from llama_stack.apis.batches import Batches
-from llama_stack.apis.benchmarks import Benchmarks
-from llama_stack.apis.conversations import Conversations
-from llama_stack.apis.datasetio import DatasetIO
-from llama_stack.apis.datasets import Datasets
-from llama_stack.apis.eval import Eval
-from llama_stack.apis.files import Files
-from llama_stack.apis.inference import Inference
-from llama_stack.apis.inspect import Inspect
-from llama_stack.apis.models import Models
-from llama_stack.apis.post_training import PostTraining
-from llama_stack.apis.prompts import Prompts
-from llama_stack.apis.providers import Providers
-from llama_stack.apis.safety import Safety
-from llama_stack.apis.scoring import Scoring
-from llama_stack.apis.scoring_functions import ScoringFunctions
-from llama_stack.apis.shields import Shields
-from llama_stack.apis.tools import ToolGroups, ToolRuntime
-from llama_stack.apis.vector_io import VectorIO
 from llama_stack.core.conversations.conversations import ConversationServiceConfig, ConversationServiceImpl
 from llama_stack.core.datatypes import Provider, SafetyConfig, StackRunConfig, VectorStoresConfig
 from llama_stack.core.distribution import get_provider_registry
@ -54,7 +58,6 @@ from llama_stack.core.storage.datatypes import (
 from llama_stack.core.store.registry import create_dist_registry
 from llama_stack.core.utils.dynamic import instantiate_class_type
 from llama_stack.log import get_logger
-from llama_stack.providers.datatypes import Api

 logger = get_logger(name=__name__, category="core")

--- a/src/llama_stack/core/telemetry/telemetry.py
+++ b/src/llama_stack/core/telemetry/telemetry.py
@ -16,6 +16,7 @@ from typing import (
    cast,
 )

+from llama_stack_api import json_schema_type, register_schema
 from opentelemetry import metrics, trace
 from opentelemetry.exporter.otlp.proto.http.metric_exporter import OTLPMetricExporter
 from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter
@ -28,7 +29,6 @@ from pydantic import BaseModel, Field

 from llama_stack.log import get_logger
 from llama_stack.models.llama.datatypes import Primitive
-from llama_stack.schema_utils import json_schema_type, register_schema

 ROOT_SPAN_MARKERS = ["__root__", "__root_span__"]

--- a/src/llama_stack/distributions/dell/dell.py
+++ b/src/llama_stack/distributions/dell/dell.py
@ -4,7 +4,8 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.

-from llama_stack.apis.models import ModelType
+from llama_stack_api import ModelType
+
 from llama_stack.core.datatypes import (
    BuildProvider,
    ModelInput,
--- a/src/llama_stack/distributions/meta-reference-gpu/meta_reference.py
+++ b/src/llama_stack/distributions/meta-reference-gpu/meta_reference.py
@ -6,7 +6,8 @@

 from pathlib import Path

-from llama_stack.apis.models import ModelType
+from llama_stack_api import ModelType
+
 from llama_stack.core.datatypes import (
    BuildProvider,
    ModelInput,
--- a/src/llama_stack/distributions/open-benchmark/open_benchmark.py
+++ b/src/llama_stack/distributions/open-benchmark/open_benchmark.py
@ -5,8 +5,8 @@
 # the root directory of this source tree.


-from llama_stack.apis.datasets import DatasetPurpose, URIDataSource
-from llama_stack.apis.models import ModelType
+from llama_stack_api import DatasetPurpose, ModelType, URIDataSource
+
 from llama_stack.core.datatypes import (
    BenchmarkInput,
    BuildProvider,
--- a/src/llama_stack/distributions/starter/starter.py
+++ b/src/llama_stack/distributions/starter/starter.py
@ -7,6 +7,8 @@

 from typing import Any

+from llama_stack_api import RemoteProviderSpec
+
 from llama_stack.core.datatypes import (
    BuildProvider,
    Provider,
@ -19,7 +21,6 @@ from llama_stack.core.datatypes import (
 )
 from llama_stack.core.utils.dynamic import instantiate_class_type
 from llama_stack.distributions.template import DistributionTemplate, RunConfigSettings
-from llama_stack.providers.datatypes import RemoteProviderSpec
 from llama_stack.providers.inline.files.localfs.config import LocalfsFilesImplConfig
 from llama_stack.providers.inline.inference.sentence_transformers import (
    SentenceTransformersInferenceConfig,
--- a/src/llama_stack/distributions/template.py
+++ b/src/llama_stack/distributions/template.py
@ -10,10 +10,9 @@ from typing import Any, Literal
 import jinja2
 import rich
 import yaml
+from llama_stack_api import DatasetPurpose, ModelType
 from pydantic import BaseModel, Field

-from llama_stack.apis.datasets import DatasetPurpose
-from llama_stack.apis.models import ModelType
 from llama_stack.core.datatypes import (
    LLAMA_STACK_RUN_CONFIG_VERSION,
    Api,
--- a/src/llama_stack/providers/datatypes.py
+++ b/src/llama_stack/providers/datatypes.py
@ -1,217 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from enum import StrEnum
-from typing import Any, Protocol
-from urllib.parse import urlparse
-
-from pydantic import BaseModel, Field
-
-from llama_stack.apis.benchmarks import Benchmark
-from llama_stack.apis.datasets import Dataset
-from llama_stack.apis.datatypes import Api
-from llama_stack.apis.models import Model
-from llama_stack.apis.scoring_functions import ScoringFn
-from llama_stack.apis.shields import Shield
-from llama_stack.apis.tools import ToolGroup
-from llama_stack.apis.vector_stores import VectorStore
-from llama_stack.schema_utils import json_schema_type
-
-
-class ModelsProtocolPrivate(Protocol):
-    """
-    Protocol for model management.
-
-    This allows users to register their preferred model identifiers.
-
-    Model registration requires -
-     - a provider, used to route the registration request
-     - a model identifier, user's intended name for the model during inference
-     - a provider model identifier, a model identifier supported by the provider
-
-    Providers will only accept registration for provider model ids they support.
-
-    Example,
-      register: provider x my-model-id x provider-model-id
-       -> Error if provider does not support provider-model-id
-       -> Error if my-model-id is already registered
-       -> Success if provider supports provider-model-id
-      inference: my-model-id x ...
-       -> Provider uses provider-model-id for inference
-    """
-
-    # this should be called `on_model_register` or something like that.
-    # the provider should _not_ be able to change the object in this
-    # callback
-    async def register_model(self, model: Model) -> Model: ...
-
-    async def unregister_model(self, model_id: str) -> None: ...
-
-    # the Stack router will query each provider for their list of models
-    # if a `refresh_interval_seconds` is provided, this method will be called
-    # periodically to refresh the list of models
-    #
-    # NOTE: each model returned will be registered with the model registry. this means
-    # a callback to the `register_model()` method will be made. this is duplicative and
-    # may be removed in the future.
-    async def list_models(self) -> list[Model] | None: ...
-
-    async def should_refresh_models(self) -> bool: ...
-
-
-class ShieldsProtocolPrivate(Protocol):
-    async def register_shield(self, shield: Shield) -> None: ...
-
-    async def unregister_shield(self, identifier: str) -> None: ...
-
-
-class VectorStoresProtocolPrivate(Protocol):
-    async def register_vector_store(self, vector_store: VectorStore) -> None: ...
-
-    async def unregister_vector_store(self, vector_store_id: str) -> None: ...
-
-
-class DatasetsProtocolPrivate(Protocol):
-    async def register_dataset(self, dataset: Dataset) -> None: ...
-
-    async def unregister_dataset(self, dataset_id: str) -> None: ...
-
-
-class ScoringFunctionsProtocolPrivate(Protocol):
-    async def list_scoring_functions(self) -> list[ScoringFn]: ...
-
-    async def register_scoring_function(self, scoring_fn: ScoringFn) -> None: ...
-
-
-class BenchmarksProtocolPrivate(Protocol):
-    async def register_benchmark(self, benchmark: Benchmark) -> None: ...
-
-
-class ToolGroupsProtocolPrivate(Protocol):
-    async def register_toolgroup(self, toolgroup: ToolGroup) -> None: ...
-
-    async def unregister_toolgroup(self, toolgroup_id: str) -> None: ...
-
-
-@json_schema_type
-class ProviderSpec(BaseModel):
-    api: Api
-    provider_type: str
-    config_class: str = Field(
-        ...,
-        description="Fully-qualified classname of the config for this provider",
-    )
-    api_dependencies: list[Api] = Field(
-        default_factory=list,
-        description="Higher-level API surfaces may depend on other providers to provide their functionality",
-    )
-    optional_api_dependencies: list[Api] = Field(
-        default_factory=list,
-    )
-    deprecation_warning: str | None = Field(
-        default=None,
-        description="If this provider is deprecated, specify the warning message here",
-    )
-    deprecation_error: str | None = Field(
-        default=None,
-        description="If this provider is deprecated and does NOT work, specify the error message here",
-    )
-
-    module: str | None = Field(
-        default=None,
-        description="""
- Fully-qualified name of the module to import. The module is expected to have:
-
-  - `get_adapter_impl(config, deps)`: returns the adapter implementation
-
-  Example: `module: ramalama_stack`
- """,
-    )
-
-    pip_packages: list[str] = Field(
-        default_factory=list,
-        description="The pip dependencies needed for this implementation",
-    )
-
-    provider_data_validator: str | None = Field(
-        default=None,
-    )
-
-    is_external: bool = Field(default=False, description="Notes whether this provider is an external provider.")
-
-    # used internally by the resolver; this is a hack for now
-    deps__: list[str] = Field(default_factory=list)
-
-    @property
-    def is_sample(self) -> bool:
-        return self.provider_type in ("sample", "remote::sample")
-
-
-class RoutingTable(Protocol):
-    async def get_provider_impl(self, routing_key: str) -> Any: ...
-
-
-@json_schema_type
-class InlineProviderSpec(ProviderSpec):
-    container_image: str | None = Field(
-        default=None,
-        description="""
-The container image to use for this implementation. If one is provided, pip_packages will be ignored.
-If a provider depends on other providers, the dependencies MUST NOT specify a container image.
-""",
-    )
-    description: str | None = Field(
-        default=None,
-        description="""
-A description of the provider. This is used to display in the documentation.
-""",
-    )
-
-
-class RemoteProviderConfig(BaseModel):
-    host: str = "localhost"
-    port: int | None = None
-    protocol: str = "http"
-
-    @property
-    def url(self) -> str:
-        if self.port is None:
-            return f"{self.protocol}://{self.host}"
-        return f"{self.protocol}://{self.host}:{self.port}"
-
-    @classmethod
-    def from_url(cls, url: str) -> "RemoteProviderConfig":
-        parsed = urlparse(url)
-        attrs = {k: v for k, v in parsed._asdict().items() if v is not None}
-        return cls(**attrs)
-
-
-@json_schema_type
-class RemoteProviderSpec(ProviderSpec):
-    adapter_type: str = Field(
-        ...,
-        description="Unique identifier for this adapter",
-    )
-
-    description: str | None = Field(
-        default=None,
-        description="""
-A description of the provider. This is used to display in the documentation.
-""",
-    )
-
-    @property
-    def container_image(self) -> str | None:
-        return None
-
-
-class HealthStatus(StrEnum):
-    OK = "OK"
-    ERROR = "Error"
-    NOT_IMPLEMENTED = "Not Implemented"
-
-
-HealthResponse = dict[str, Any]
--- a/src/llama_stack/providers/inline/agents/meta_reference/agents.py
+++ b/src/llama_stack/providers/inline/agents/meta_reference/agents.py
@ -5,25 +5,26 @@
 # the root directory of this source tree.


-from llama_stack.apis.agents import (
+from llama_stack_api import (
    Agents,
+    Conversations,
+    Inference,
    ListOpenAIResponseInputItem,
    ListOpenAIResponseObject,
    OpenAIDeleteResponseObject,
    OpenAIResponseInput,
    OpenAIResponseInputTool,
    OpenAIResponseObject,
+    OpenAIResponsePrompt,
+    OpenAIResponseText,
    Order,
+    ResponseGuardrail,
+    Safety,
+    ToolGroups,
+    ToolRuntime,
+    VectorIO,
 )
-from llama_stack.apis.agents.agents import ResponseGuardrail
-from llama_stack.apis.agents.openai_responses import OpenAIResponsePrompt, OpenAIResponseText
-from llama_stack.apis.conversations import Conversations
-from llama_stack.apis.inference import (
-    Inference,
-)
-from llama_stack.apis.safety import Safety
-from llama_stack.apis.tools import ToolGroups, ToolRuntime
-from llama_stack.apis.vector_io import VectorIO
+
 from llama_stack.core.datatypes import AccessRule
 from llama_stack.log import get_logger
 from llama_stack.providers.utils.kvstore import InmemoryKVStoreImpl, kvstore_impl
--- a/src/llama_stack/providers/inline/agents/meta_reference/responses/openai_responses.py
+++ b/src/llama_stack/providers/inline/agents/meta_reference/responses/openai_responses.py
@ -8,14 +8,15 @@ import time
 import uuid
 from collections.abc import AsyncIterator

-from pydantic import BaseModel, TypeAdapter
-
-from llama_stack.apis.agents import Order
-from llama_stack.apis.agents.agents import ResponseGuardrailSpec
-from llama_stack.apis.agents.openai_responses import (
+from llama_stack_api import (
+    ConversationItem,
+    Conversations,
+    Inference,
+    InvalidConversationIdError,
    ListOpenAIResponseInputItem,
    ListOpenAIResponseObject,
    OpenAIDeleteResponseObject,
+    OpenAIMessageParam,
    OpenAIResponseInput,
    OpenAIResponseInputMessageContentText,
    OpenAIResponseInputTool,
@ -25,20 +26,16 @@ from llama_stack.apis.agents.openai_responses import (
    OpenAIResponsePrompt,
    OpenAIResponseText,
    OpenAIResponseTextFormat,
-)
-from llama_stack.apis.common.errors import (
-    InvalidConversationIdError,
-)
-from llama_stack.apis.conversations import Conversations
-from llama_stack.apis.conversations.conversations import ConversationItem
-from llama_stack.apis.inference import (
-    Inference,
-    OpenAIMessageParam,
    OpenAISystemMessageParam,
+    Order,
+    ResponseGuardrailSpec,
+    Safety,
+    ToolGroups,
+    ToolRuntime,
+    VectorIO,
 )
-from llama_stack.apis.safety import Safety
-from llama_stack.apis.tools import ToolGroups, ToolRuntime
-from llama_stack.apis.vector_io import VectorIO
+from pydantic import BaseModel, TypeAdapter
+
 from llama_stack.log import get_logger
 from llama_stack.providers.utils.responses.responses_store import (
    ResponsesStore,
--- a/src/llama_stack/providers/inline/agents/meta_reference/responses/streaming.py
+++ b/src/llama_stack/providers/inline/agents/meta_reference/responses/streaming.py
@ -8,10 +8,18 @@ import uuid
 from collections.abc import AsyncIterator
 from typing import Any

-from llama_stack.apis.agents.openai_responses import (
+from llama_stack_api import (
    AllowedToolsFilter,
    ApprovalFilter,
+    Inference,
    MCPListToolsTool,
+    OpenAIAssistantMessageParam,
+    OpenAIChatCompletion,
+    OpenAIChatCompletionChunk,
+    OpenAIChatCompletionRequestWithExtraBody,
+    OpenAIChatCompletionToolCall,
+    OpenAIChoice,
+    OpenAIMessageParam,
    OpenAIResponseContentPartOutputText,
    OpenAIResponseContentPartReasoningText,
    OpenAIResponseContentPartRefusal,
@ -56,16 +64,7 @@ from llama_stack.apis.agents.openai_responses import (
    OpenAIResponseUsageOutputTokensDetails,
    WebSearchToolTypes,
 )
-from llama_stack.apis.inference import (
-    Inference,
-    OpenAIAssistantMessageParam,
-    OpenAIChatCompletion,
-    OpenAIChatCompletionChunk,
-    OpenAIChatCompletionRequestWithExtraBody,
-    OpenAIChatCompletionToolCall,
-    OpenAIChoice,
-    OpenAIMessageParam,
-)
+
 from llama_stack.core.telemetry import tracing
 from llama_stack.log import get_logger
 from llama_stack.providers.utils.inference.prompt_adapter import interleaved_content_as_str
@ -1023,9 +1022,9 @@ class StreamingResponseOrchestrator:
        self, tools: list[OpenAIResponseInputTool], output_messages: list[OpenAIResponseOutput]
    ) -> AsyncIterator[OpenAIResponseObjectStream]:
        """Process all tools and emit appropriate streaming events."""
+        from llama_stack_api import ToolDef
        from openai.types.chat import ChatCompletionToolParam

-        from llama_stack.apis.tools import ToolDef
        from llama_stack.models.llama.datatypes import ToolDefinition
        from llama_stack.providers.utils.inference.openai_compat import convert_tooldef_to_openai_tool

--- a/src/llama_stack/providers/inline/agents/meta_reference/responses/tool_executor.py
+++ b/src/llama_stack/providers/inline/agents/meta_reference/responses/tool_executor.py
@ -9,7 +9,12 @@ import json
 from collections.abc import AsyncIterator
 from typing import Any

-from llama_stack.apis.agents.openai_responses import (
+from llama_stack_api import (
+    ImageContentItem,
+    OpenAIChatCompletionContentPartImageParam,
+    OpenAIChatCompletionContentPartTextParam,
+    OpenAIChatCompletionToolCall,
+    OpenAIImageURL,
    OpenAIResponseInputToolFileSearch,
    OpenAIResponseInputToolMCP,
    OpenAIResponseObjectStreamResponseFileSearchCallCompleted,
@ -23,22 +28,15 @@ from llama_stack.apis.agents.openai_responses import (
    OpenAIResponseObjectStreamResponseWebSearchCallSearching,
    OpenAIResponseOutputMessageFileSearchToolCall,
    OpenAIResponseOutputMessageFileSearchToolCallResults,
-    OpenAIResponseOutputMessageMCPCall,
    OpenAIResponseOutputMessageWebSearchToolCall,
-)
-from llama_stack.apis.common.content_types import (
-    ImageContentItem,
-    TextContentItem,
-)
-from llama_stack.apis.inference import (
-    OpenAIChatCompletionContentPartImageParam,
-    OpenAIChatCompletionContentPartTextParam,
-    OpenAIChatCompletionToolCall,
-    OpenAIImageURL,
    OpenAIToolMessageParam,
+    TextContentItem,
+    ToolGroups,
+    ToolInvocationResult,
+    ToolRuntime,
+    VectorIO,
 )
-from llama_stack.apis.tools import ToolGroups, ToolInvocationResult, ToolRuntime
-from llama_stack.apis.vector_io import VectorIO
+
 from llama_stack.core.telemetry import tracing
 from llama_stack.log import get_logger

@ -398,6 +396,10 @@ class ToolExecutor:
        # Build output message
        message: Any
        if mcp_tool_to_server and function.name in mcp_tool_to_server:
+            from llama_stack_api import (
+                OpenAIResponseOutputMessageMCPCall,
+            )
+
            message = OpenAIResponseOutputMessageMCPCall(
                id=item_id,
                arguments=function.arguments,
--- a/Show more
+++ b/Show more