feat: split API and provider specs into separate llama-stack-api pkg (#3895)

# What does this PR do?

Extract API definitions and provider specifications into a standalone
llama-stack-api package that can be published to PyPI independently of
the main llama-stack server.


See: https://github.com/llamastack/llama-stack/pull/2978 and
https://github.com/llamastack/llama-stack/pull/2978#issuecomment-3145115942

## Motivation

External providers currently import from llama-stack, which overrides
the installed version and causes dependency conflicts. This separation
allows external providers to:

- Install only the type definitions they need without server
dependencies
- Avoid version conflicts with the installed llama-stack package
- Be versioned and released independently

This enables us to re-enable external provider module tests that were
previously blocked by these import conflicts.

## Changes

- Created the llama-stack-api package with minimal dependencies (pydantic,
jsonschema)
- Moved the APIs, provider datatypes, strong_typing, and schema_utils
- Updated all imports from `llama_stack.*` to `llama_stack_api.*` (see the
sketch after this list)
- Configured a local editable install for the development workflow
- Updated linting and type-checking configuration for both packages
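
A minimal sketch of the migration from an external provider's point of
view, assuming the layout above; the `acme` names, module, and package
are hypothetical:

```python
# Before the split, external providers imported from the server package:
# from llama_stack.providers.datatypes import Api, RemoteProviderSpec

# After the split, only the lightweight type package is required:
from llama_stack_api import Api, RemoteProviderSpec

# Hypothetical external provider spec built solely from llama-stack-api types.
acme_spec = RemoteProviderSpec(
    api=Api.inference,
    adapter_type="acme",
    provider_type="remote::acme",
    config_class="acme_stack.config.AcmeConfig",
    module="acme_stack",
    pip_packages=["acme-sdk"],
)
```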

## Next Steps

- Publish llama-stack-api to PyPI
- Update external provider dependencies
- Re-enable external provider module tests


Precursor PRs to this one:

- #4093 
- #3954 
- #4064 

These PRs moved key pieces _out_ of the API package, limiting the scope of
the changes here.


Relates to #3237

## Test Plan

The package builds successfully and can be imported independently. All
pre-commit hooks pass with the expected exclusions maintained.
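
For illustration, a smoke test along these lines (a sketch assuming the
local editable install described above, not the exact CI invocation):

```python
# The type package must import without pulling in the llama-stack server.
import importlib

api_pkg = importlib.import_module("llama_stack_api")
assert hasattr(api_pkg, "ProviderSpec")  # provider specs now live here
print("llama_stack_api imports independently")
```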

---------

Signed-off-by: Charlie Doern <cdoern@redhat.com>

@@ -1,217 +0,0 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from enum import StrEnum
from typing import Any, Protocol
from urllib.parse import urlparse
from pydantic import BaseModel, Field
from llama_stack.apis.benchmarks import Benchmark
from llama_stack.apis.datasets import Dataset
from llama_stack.apis.datatypes import Api
from llama_stack.apis.models import Model
from llama_stack.apis.scoring_functions import ScoringFn
from llama_stack.apis.shields import Shield
from llama_stack.apis.tools import ToolGroup
from llama_stack.apis.vector_stores import VectorStore
from llama_stack.schema_utils import json_schema_type
class ModelsProtocolPrivate(Protocol):
"""
Protocol for model management.
This allows users to register their preferred model identifiers.
Model registration requires -
- a provider, used to route the registration request
- a model identifier, user's intended name for the model during inference
- a provider model identifier, a model identifier supported by the provider
Providers will only accept registration for provider model ids they support.
Example,
register: provider x my-model-id x provider-model-id
-> Error if provider does not support provider-model-id
-> Error if my-model-id is already registered
-> Success if provider supports provider-model-id
inference: my-model-id x ...
-> Provider uses provider-model-id for inference
"""
# this should be called `on_model_register` or something like that.
# the provider should _not_ be able to change the object in this
# callback
async def register_model(self, model: Model) -> Model: ...
async def unregister_model(self, model_id: str) -> None: ...
# the Stack router will query each provider for their list of models
# if a `refresh_interval_seconds` is provided, this method will be called
# periodically to refresh the list of models
#
# NOTE: each model returned will be registered with the model registry. this means
# a callback to the `register_model()` method will be made. this is duplicative and
# may be removed in the future.
async def list_models(self) -> list[Model] | None: ...
async def should_refresh_models(self) -> bool: ...
class ShieldsProtocolPrivate(Protocol):
async def register_shield(self, shield: Shield) -> None: ...
async def unregister_shield(self, identifier: str) -> None: ...
class VectorStoresProtocolPrivate(Protocol):
async def register_vector_store(self, vector_store: VectorStore) -> None: ...
async def unregister_vector_store(self, vector_store_id: str) -> None: ...
class DatasetsProtocolPrivate(Protocol):
async def register_dataset(self, dataset: Dataset) -> None: ...
async def unregister_dataset(self, dataset_id: str) -> None: ...
class ScoringFunctionsProtocolPrivate(Protocol):
async def list_scoring_functions(self) -> list[ScoringFn]: ...
async def register_scoring_function(self, scoring_fn: ScoringFn) -> None: ...
class BenchmarksProtocolPrivate(Protocol):
async def register_benchmark(self, benchmark: Benchmark) -> None: ...
class ToolGroupsProtocolPrivate(Protocol):
async def register_toolgroup(self, toolgroup: ToolGroup) -> None: ...
async def unregister_toolgroup(self, toolgroup_id: str) -> None: ...
@json_schema_type
class ProviderSpec(BaseModel):
api: Api
provider_type: str
config_class: str = Field(
...,
description="Fully-qualified classname of the config for this provider",
)
api_dependencies: list[Api] = Field(
default_factory=list,
description="Higher-level API surfaces may depend on other providers to provide their functionality",
)
optional_api_dependencies: list[Api] = Field(
default_factory=list,
)
deprecation_warning: str | None = Field(
default=None,
description="If this provider is deprecated, specify the warning message here",
)
deprecation_error: str | None = Field(
default=None,
description="If this provider is deprecated and does NOT work, specify the error message here",
)
module: str | None = Field(
default=None,
description="""
Fully-qualified name of the module to import. The module is expected to have:
- `get_adapter_impl(config, deps)`: returns the adapter implementation
Example: `module: ramalama_stack`
""",
)
pip_packages: list[str] = Field(
default_factory=list,
description="The pip dependencies needed for this implementation",
)
provider_data_validator: str | None = Field(
default=None,
)
is_external: bool = Field(default=False, description="Notes whether this provider is an external provider.")
# used internally by the resolver; this is a hack for now
deps__: list[str] = Field(default_factory=list)
@property
def is_sample(self) -> bool:
return self.provider_type in ("sample", "remote::sample")
class RoutingTable(Protocol):
async def get_provider_impl(self, routing_key: str) -> Any: ...
@json_schema_type
class InlineProviderSpec(ProviderSpec):
container_image: str | None = Field(
default=None,
description="""
The container image to use for this implementation. If one is provided, pip_packages will be ignored.
If a provider depends on other providers, the dependencies MUST NOT specify a container image.
""",
)
description: str | None = Field(
default=None,
description="""
A description of the provider. This is used to display in the documentation.
""",
)
class RemoteProviderConfig(BaseModel):
host: str = "localhost"
port: int | None = None
protocol: str = "http"
@property
def url(self) -> str:
if self.port is None:
return f"{self.protocol}://{self.host}"
return f"{self.protocol}://{self.host}:{self.port}"
@classmethod
def from_url(cls, url: str) -> "RemoteProviderConfig":
parsed = urlparse(url)
attrs = {k: v for k, v in parsed._asdict().items() if v is not None}
return cls(**attrs)
@json_schema_type
class RemoteProviderSpec(ProviderSpec):
adapter_type: str = Field(
...,
description="Unique identifier for this adapter",
)
description: str | None = Field(
default=None,
description="""
A description of the provider. This is used to display in the documentation.
""",
)
@property
def container_image(self) -> str | None:
return None
class HealthStatus(StrEnum):
OK = "OK"
ERROR = "Error"
NOT_IMPLEMENTED = "Not Implemented"
HealthResponse = dict[str, Any]
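
For orientation, a minimal illustrative sketch (not part of this changeset)
of an external provider satisfying the `ModelsProtocolPrivate` protocol
above once it is imported from the new package; the `Acme*` names and model
ids are hypothetical:

```python
from llama_stack_api import Model


class AcmeInferenceAdapter:
    """Hypothetical provider with a static, in-memory model registry."""

    def __init__(self) -> None:
        self._models: dict[str, Model] = {}

    async def register_model(self, model: Model) -> Model:
        # Providers only accept provider model ids they actually serve.
        if model.provider_resource_id not in ("acme-small", "acme-large"):
            raise ValueError(f"unsupported provider model: {model.provider_resource_id}")
        self._models[model.identifier] = model
        return model

    async def unregister_model(self, model_id: str) -> None:
        self._models.pop(model_id, None)

    async def list_models(self) -> list[Model] | None:
        return list(self._models.values())

    async def should_refresh_models(self) -> bool:
        return False  # static catalog; no periodic refresh needed
```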

@@ -5,25 +5,26 @@
# the root directory of this source tree.
from llama_stack.apis.agents import (
from llama_stack_api import (
Agents,
Conversations,
Inference,
ListOpenAIResponseInputItem,
ListOpenAIResponseObject,
OpenAIDeleteResponseObject,
OpenAIResponseInput,
OpenAIResponseInputTool,
OpenAIResponseObject,
OpenAIResponsePrompt,
OpenAIResponseText,
Order,
ResponseGuardrail,
Safety,
ToolGroups,
ToolRuntime,
VectorIO,
)
from llama_stack.apis.agents.agents import ResponseGuardrail
from llama_stack.apis.agents.openai_responses import OpenAIResponsePrompt, OpenAIResponseText
from llama_stack.apis.conversations import Conversations
from llama_stack.apis.inference import (
Inference,
)
from llama_stack.apis.safety import Safety
from llama_stack.apis.tools import ToolGroups, ToolRuntime
from llama_stack.apis.vector_io import VectorIO
from llama_stack.core.datatypes import AccessRule
from llama_stack.log import get_logger
from llama_stack.providers.utils.kvstore import InmemoryKVStoreImpl, kvstore_impl

@@ -8,14 +8,15 @@ import time
import uuid
from collections.abc import AsyncIterator
from pydantic import BaseModel, TypeAdapter
from llama_stack.apis.agents import Order
from llama_stack.apis.agents.agents import ResponseGuardrailSpec
from llama_stack.apis.agents.openai_responses import (
from llama_stack_api import (
ConversationItem,
Conversations,
Inference,
InvalidConversationIdError,
ListOpenAIResponseInputItem,
ListOpenAIResponseObject,
OpenAIDeleteResponseObject,
OpenAIMessageParam,
OpenAIResponseInput,
OpenAIResponseInputMessageContentText,
OpenAIResponseInputTool,
@@ -25,20 +26,16 @@ from llama_stack.apis.agents.openai_responses import (
OpenAIResponsePrompt,
OpenAIResponseText,
OpenAIResponseTextFormat,
)
from llama_stack.apis.common.errors import (
InvalidConversationIdError,
)
from llama_stack.apis.conversations import Conversations
from llama_stack.apis.conversations.conversations import ConversationItem
from llama_stack.apis.inference import (
Inference,
OpenAIMessageParam,
OpenAISystemMessageParam,
Order,
ResponseGuardrailSpec,
Safety,
ToolGroups,
ToolRuntime,
VectorIO,
)
from llama_stack.apis.safety import Safety
from llama_stack.apis.tools import ToolGroups, ToolRuntime
from llama_stack.apis.vector_io import VectorIO
from pydantic import BaseModel, TypeAdapter
from llama_stack.log import get_logger
from llama_stack.providers.utils.responses.responses_store import (
ResponsesStore,

@@ -8,10 +8,18 @@ import uuid
from collections.abc import AsyncIterator
from typing import Any
from llama_stack.apis.agents.openai_responses import (
from llama_stack_api import (
AllowedToolsFilter,
ApprovalFilter,
Inference,
MCPListToolsTool,
OpenAIAssistantMessageParam,
OpenAIChatCompletion,
OpenAIChatCompletionChunk,
OpenAIChatCompletionRequestWithExtraBody,
OpenAIChatCompletionToolCall,
OpenAIChoice,
OpenAIMessageParam,
OpenAIResponseContentPartOutputText,
OpenAIResponseContentPartReasoningText,
OpenAIResponseContentPartRefusal,
@@ -56,16 +64,7 @@ from llama_stack.apis.agents.openai_responses import (
OpenAIResponseUsageOutputTokensDetails,
WebSearchToolTypes,
)
from llama_stack.apis.inference import (
Inference,
OpenAIAssistantMessageParam,
OpenAIChatCompletion,
OpenAIChatCompletionChunk,
OpenAIChatCompletionRequestWithExtraBody,
OpenAIChatCompletionToolCall,
OpenAIChoice,
OpenAIMessageParam,
)
from llama_stack.core.telemetry import tracing
from llama_stack.log import get_logger
from llama_stack.providers.utils.inference.prompt_adapter import interleaved_content_as_str
@@ -1023,9 +1022,9 @@ class StreamingResponseOrchestrator:
self, tools: list[OpenAIResponseInputTool], output_messages: list[OpenAIResponseOutput]
) -> AsyncIterator[OpenAIResponseObjectStream]:
"""Process all tools and emit appropriate streaming events."""
from llama_stack_api import ToolDef
from openai.types.chat import ChatCompletionToolParam
from llama_stack.apis.tools import ToolDef
from llama_stack.models.llama.datatypes import ToolDefinition
from llama_stack.providers.utils.inference.openai_compat import convert_tooldef_to_openai_tool

@@ -9,7 +9,12 @@ import json
from collections.abc import AsyncIterator
from typing import Any
from llama_stack.apis.agents.openai_responses import (
from llama_stack_api import (
ImageContentItem,
OpenAIChatCompletionContentPartImageParam,
OpenAIChatCompletionContentPartTextParam,
OpenAIChatCompletionToolCall,
OpenAIImageURL,
OpenAIResponseInputToolFileSearch,
OpenAIResponseInputToolMCP,
OpenAIResponseObjectStreamResponseFileSearchCallCompleted,
@@ -23,22 +28,15 @@ from llama_stack.apis.agents.openai_responses import (
OpenAIResponseObjectStreamResponseWebSearchCallSearching,
OpenAIResponseOutputMessageFileSearchToolCall,
OpenAIResponseOutputMessageFileSearchToolCallResults,
OpenAIResponseOutputMessageMCPCall,
OpenAIResponseOutputMessageWebSearchToolCall,
)
from llama_stack.apis.common.content_types import (
ImageContentItem,
TextContentItem,
)
from llama_stack.apis.inference import (
OpenAIChatCompletionContentPartImageParam,
OpenAIChatCompletionContentPartTextParam,
OpenAIChatCompletionToolCall,
OpenAIImageURL,
OpenAIToolMessageParam,
TextContentItem,
ToolGroups,
ToolInvocationResult,
ToolRuntime,
VectorIO,
)
from llama_stack.apis.tools import ToolGroups, ToolInvocationResult, ToolRuntime
from llama_stack.apis.vector_io import VectorIO
from llama_stack.core.telemetry import tracing
from llama_stack.log import get_logger
@@ -398,6 +396,10 @@ class ToolExecutor:
# Build output message
message: Any
if mcp_tool_to_server and function.name in mcp_tool_to_server:
from llama_stack_api import (
OpenAIResponseOutputMessageMCPCall,
)
message = OpenAIResponseOutputMessageMCPCall(
id=item_id,
arguments=function.arguments,

@@ -7,10 +7,10 @@
from dataclasses import dataclass
from typing import cast
from openai.types.chat import ChatCompletionToolParam
from pydantic import BaseModel
from llama_stack.apis.agents.openai_responses import (
from llama_stack_api import (
OpenAIChatCompletionToolCall,
OpenAIMessageParam,
OpenAIResponseFormatParam,
OpenAIResponseInput,
OpenAIResponseInputTool,
OpenAIResponseInputToolFileSearch,
@@ -26,7 +26,8 @@ from llama_stack.apis.agents.openai_responses import (
OpenAIResponseTool,
OpenAIResponseToolMCP,
)
from llama_stack.apis.inference import OpenAIChatCompletionToolCall, OpenAIMessageParam, OpenAIResponseFormatParam
from openai.types.chat import ChatCompletionToolParam
from pydantic import BaseModel
class ToolExecutionResult(BaseModel):

@@ -9,9 +9,23 @@ import re
import uuid
from collections.abc import Sequence
from llama_stack.apis.agents.agents import ResponseGuardrailSpec
from llama_stack.apis.agents.openai_responses import (
from llama_stack_api import (
OpenAIAssistantMessageParam,
OpenAIChatCompletionContentPartImageParam,
OpenAIChatCompletionContentPartParam,
OpenAIChatCompletionContentPartTextParam,
OpenAIChatCompletionToolCall,
OpenAIChatCompletionToolCallFunction,
OpenAIChoice,
OpenAIDeveloperMessageParam,
OpenAIImageURL,
OpenAIJSONSchema,
OpenAIMessageParam,
OpenAIResponseAnnotationFileCitation,
OpenAIResponseFormatJSONObject,
OpenAIResponseFormatJSONSchema,
OpenAIResponseFormatParam,
OpenAIResponseFormatText,
OpenAIResponseInput,
OpenAIResponseInputFunctionToolCallOutput,
OpenAIResponseInputMessageContent,
@@ -27,28 +41,12 @@ from llama_stack.apis.agents.openai_responses import (
OpenAIResponseOutputMessageMCPCall,
OpenAIResponseOutputMessageMCPListTools,
OpenAIResponseText,
)
from llama_stack.apis.inference import (
OpenAIAssistantMessageParam,
OpenAIChatCompletionContentPartImageParam,
OpenAIChatCompletionContentPartParam,
OpenAIChatCompletionContentPartTextParam,
OpenAIChatCompletionToolCall,
OpenAIChatCompletionToolCallFunction,
OpenAIChoice,
OpenAIDeveloperMessageParam,
OpenAIImageURL,
OpenAIJSONSchema,
OpenAIMessageParam,
OpenAIResponseFormatJSONObject,
OpenAIResponseFormatJSONSchema,
OpenAIResponseFormatParam,
OpenAIResponseFormatText,
OpenAISystemMessageParam,
OpenAIToolMessageParam,
OpenAIUserMessageParam,
ResponseGuardrailSpec,
Safety,
)
from llama_stack.apis.safety import Safety
async def convert_chat_choice_to_response_message(

@@ -6,8 +6,8 @@
import asyncio
from llama_stack.apis.inference import OpenAIMessageParam
from llama_stack.apis.safety import Safety, SafetyViolation, ViolationLevel
from llama_stack_api import OpenAIMessageParam, Safety, SafetyViolation, ViolationLevel
from llama_stack.core.telemetry import tracing
from llama_stack.log import get_logger

@@ -6,9 +6,8 @@
from typing import Any
from llama_stack.apis.files import Files
from llama_stack.apis.inference import Inference
from llama_stack.apis.models import Models
from llama_stack_api import Files, Inference, Models
from llama_stack.core.datatypes import AccessRule, Api
from llama_stack.providers.utils.kvstore import kvstore_impl

@@ -13,25 +13,29 @@ import uuid
from io import BytesIO
from typing import Any, Literal
from openai.types.batch import BatchError, Errors
from pydantic import BaseModel
from llama_stack.apis.batches import Batches, BatchObject, ListBatchesResponse
from llama_stack.apis.common.errors import ConflictError, ResourceNotFoundError
from llama_stack.apis.files import Files, OpenAIFilePurpose
from llama_stack.apis.inference import (
from llama_stack_api import (
Batches,
BatchObject,
ConflictError,
Files,
Inference,
ListBatchesResponse,
Models,
OpenAIAssistantMessageParam,
OpenAIChatCompletionRequestWithExtraBody,
OpenAICompletionRequestWithExtraBody,
OpenAIDeveloperMessageParam,
OpenAIEmbeddingsRequestWithExtraBody,
OpenAIFilePurpose,
OpenAIMessageParam,
OpenAISystemMessageParam,
OpenAIToolMessageParam,
OpenAIUserMessageParam,
ResourceNotFoundError,
)
from llama_stack.apis.models import Models
from openai.types.batch import BatchError, Errors
from pydantic import BaseModel
from llama_stack.log import get_logger
from llama_stack.providers.utils.kvstore import KVStore

@@ -5,10 +5,8 @@
# the root directory of this source tree.
from typing import Any
from llama_stack.apis.common.responses import PaginatedResponse
from llama_stack.apis.datasetio import DatasetIO
from llama_stack.apis.datasets import Dataset
from llama_stack.providers.datatypes import DatasetsProtocolPrivate
from llama_stack_api import Dataset, DatasetIO, DatasetsProtocolPrivate, PaginatedResponse
from llama_stack.providers.utils.datasetio.url_utils import get_dataframe_from_uri
from llama_stack.providers.utils.kvstore import kvstore_impl
from llama_stack.providers.utils.pagination import paginate_records

@@ -6,26 +6,29 @@
import json
from typing import Any
from tqdm import tqdm
from llama_stack.apis.agents import Agents
from llama_stack.apis.benchmarks import Benchmark
from llama_stack.apis.datasetio import DatasetIO
from llama_stack.apis.datasets import Datasets
from llama_stack.apis.inference import (
from llama_stack_api import (
Agents,
Benchmark,
BenchmarkConfig,
BenchmarksProtocolPrivate,
DatasetIO,
Datasets,
Eval,
EvaluateResponse,
Inference,
Job,
JobStatus,
OpenAIChatCompletionRequestWithExtraBody,
OpenAICompletionRequestWithExtraBody,
OpenAISystemMessageParam,
OpenAIUserMessageParam,
Scoring,
)
from llama_stack.apis.scoring import Scoring
from llama_stack.providers.datatypes import BenchmarksProtocolPrivate
from tqdm import tqdm
from llama_stack.providers.utils.common.data_schema_validator import ColumnName
from llama_stack.providers.utils.kvstore import kvstore_impl
from .....apis.common.job_types import Job, JobStatus
from .....apis.eval.eval import BenchmarkConfig, Eval, EvaluateResponse
from .config import MetaReferenceEvalConfig
EVAL_TASKS_PREFIX = "benchmarks:"

@@ -10,17 +10,17 @@ from pathlib import Path
from typing import Annotated
from fastapi import Depends, File, Form, Response, UploadFile
from llama_stack.apis.common.errors import ResourceNotFoundError
from llama_stack.apis.common.responses import Order
from llama_stack.apis.files import (
from llama_stack_api import (
ExpiresAfter,
Files,
ListOpenAIFileResponse,
OpenAIFileDeleteResponse,
OpenAIFileObject,
OpenAIFilePurpose,
Order,
ResourceNotFoundError,
)
from llama_stack.core.datatypes import AccessRule
from llama_stack.core.id_generation import generate_object_id
from llama_stack.log import get_logger

@@ -6,9 +6,9 @@
from typing import Any
from llama_stack_api import QuantizationConfig
from pydantic import BaseModel, field_validator
from llama_stack.apis.inference import QuantizationConfig
from llama_stack.providers.utils.inference import supported_inference_models

@@ -8,9 +8,7 @@ import math
from typing import Optional
import torch
from lmformatenforcer import JsonSchemaParser, TokenEnforcer, TokenEnforcerTokenizerData
from llama_stack.apis.inference import (
from llama_stack_api import (
GreedySamplingStrategy,
JsonSchemaResponseFormat,
OpenAIChatCompletionRequestWithExtraBody,
@@ -20,6 +18,8 @@ from llama_stack.apis.inference import (
SamplingParams,
TopPSamplingStrategy,
)
from lmformatenforcer import JsonSchemaParser, TokenEnforcer, TokenEnforcerTokenizerData
from llama_stack.models.llama.datatypes import QuantizationMode, ToolPromptFormat
from llama_stack.models.llama.llama3.generation import Llama3
from llama_stack.models.llama.llama3.tokenizer import Tokenizer as Llama3Tokenizer

@@ -9,22 +9,23 @@ import time
import uuid
from collections.abc import AsyncIterator
from llama_stack.apis.inference import (
from llama_stack_api import (
InferenceProvider,
Model,
ModelsProtocolPrivate,
ModelType,
OpenAIAssistantMessageParam,
OpenAIChatCompletion,
OpenAIChatCompletionChunk,
OpenAIChatCompletionRequestWithExtraBody,
OpenAIChatCompletionUsage,
OpenAIChoice,
OpenAICompletion,
OpenAICompletionRequestWithExtraBody,
OpenAIUserMessageParam,
ToolChoice,
)
from llama_stack.apis.inference.inference import (
OpenAIChatCompletion,
OpenAIChatCompletionChunk,
OpenAICompletion,
)
from llama_stack.apis.models import Model, ModelType
from llama_stack.log import get_logger
from llama_stack.models.llama.datatypes import RawMessage, RawTextItem, ToolDefinition
from llama_stack.models.llama.llama3.chat_format import ChatFormat as Llama3ChatFormat
@@ -40,7 +41,6 @@ from llama_stack.models.llama.llama4.prompt_templates.system_prompts import (
from llama_stack.models.llama.llama4.tokenizer import Tokenizer as Llama4Tokenizer
from llama_stack.models.llama.sku_list import resolve_model
from llama_stack.models.llama.sku_types import ModelFamily, is_multimodal
from llama_stack.providers.datatypes import ModelsProtocolPrivate
from llama_stack.providers.utils.inference.embedding_mixin import (
SentenceTransformerEmbeddingMixin,
)
@@ -376,7 +376,7 @@ class MetaReferenceInferenceImpl(
# Convert tool calls to OpenAI format
openai_tool_calls = None
if decoded_message.tool_calls:
from llama_stack.apis.inference import (
from llama_stack_api import (
OpenAIChatCompletionToolCall,
OpenAIChatCompletionToolCallFunction,
)
@@ -441,13 +441,14 @@
params: OpenAIChatCompletionRequestWithExtraBody,
) -> AsyncIterator[OpenAIChatCompletionChunk]:
"""Stream chat completion chunks as they're generated."""
from llama_stack.apis.inference import (
from llama_stack_api import (
OpenAIChatCompletionChunk,
OpenAIChatCompletionToolCall,
OpenAIChatCompletionToolCallFunction,
OpenAIChoiceDelta,
OpenAIChunkChoice,
)
from llama_stack.models.llama.datatypes import StopReason
from llama_stack.providers.utils.inference.prompt_adapter import decode_assistant_message

@@ -6,19 +6,19 @@
from collections.abc import AsyncIterator
from llama_stack.apis.inference import (
from llama_stack_api import (
InferenceProvider,
OpenAIChatCompletionRequestWithExtraBody,
OpenAICompletionRequestWithExtraBody,
)
from llama_stack.apis.inference.inference import (
Model,
ModelsProtocolPrivate,
ModelType,
OpenAIChatCompletion,
OpenAIChatCompletionChunk,
OpenAIChatCompletionRequestWithExtraBody,
OpenAICompletion,
OpenAICompletionRequestWithExtraBody,
)
from llama_stack.apis.models import ModelType
from llama_stack.log import get_logger
from llama_stack.providers.datatypes import Model, ModelsProtocolPrivate
from llama_stack.providers.utils.inference.embedding_mixin import (
SentenceTransformerEmbeddingMixin,
)

@@ -12,11 +12,8 @@
from typing import Any
from llama_stack.apis.common.type_system import (
ChatCompletionInputType,
DialogType,
StringType,
)
from llama_stack_api import ChatCompletionInputType, DialogType, StringType
from llama_stack.providers.utils.common.data_schema_validator import (
ColumnName,
)

@@ -6,11 +6,11 @@
from enum import Enum
from typing import Any
from llama_stack.apis.datasetio import DatasetIO
from llama_stack.apis.datasets import Datasets
from llama_stack.apis.post_training import (
from llama_stack_api import (
AlgorithmConfig,
Checkpoint,
DatasetIO,
Datasets,
DPOAlignmentConfig,
JobStatus,
ListPostTrainingJobsResponse,
@@ -19,6 +19,7 @@ from llama_stack.apis.post_training import (
PostTrainingJobStatusResponse,
TrainingConfig,
)
from llama_stack.providers.inline.post_training.huggingface.config import (
HuggingFacePostTrainingConfig,
)

@@ -12,20 +12,20 @@ from typing import Any
import torch
from datasets import Dataset
from llama_stack_api import (
Checkpoint,
DataConfig,
DatasetIO,
Datasets,
LoraFinetuningConfig,
TrainingConfig,
)
from peft import LoraConfig
from transformers import (
AutoTokenizer,
)
from trl import SFTConfig, SFTTrainer
from llama_stack.apis.datasetio import DatasetIO
from llama_stack.apis.datasets import Datasets
from llama_stack.apis.post_training import (
Checkpoint,
DataConfig,
LoraFinetuningConfig,
TrainingConfig,
)
from llama_stack.log import get_logger
from llama_stack.providers.inline.post_training.common.utils import evacuate_model_from_device

@@ -11,18 +11,18 @@ from typing import Any
import torch
from datasets import Dataset
from llama_stack_api import (
Checkpoint,
DatasetIO,
Datasets,
DPOAlignmentConfig,
TrainingConfig,
)
from transformers import (
AutoTokenizer,
)
from trl import DPOConfig, DPOTrainer
from llama_stack.apis.datasetio import DatasetIO
from llama_stack.apis.datasets import Datasets
from llama_stack.apis.post_training import (
Checkpoint,
DPOAlignmentConfig,
TrainingConfig,
)
from llama_stack.log import get_logger
from llama_stack.providers.inline.post_training.common.utils import evacuate_model_from_device

@@ -14,6 +14,7 @@ from typing import TYPE_CHECKING, Any, Protocol
import psutil
import torch
from datasets import Dataset
from llama_stack_api import Checkpoint, DatasetIO, TrainingConfig
from transformers import AutoConfig, AutoModelForCausalLM
if TYPE_CHECKING:
@@ -34,8 +35,6 @@ class HFAutoModel(Protocol):
def save_pretrained(self, save_directory: str | Path) -> None: ...
from llama_stack.apis.datasetio import DatasetIO
from llama_stack.apis.post_training import Checkpoint, TrainingConfig
from llama_stack.log import get_logger
from .config import HuggingFacePostTrainingConfig

@@ -13,6 +13,7 @@
from collections.abc import Callable
import torch
from llama_stack_api import DatasetFormat
from pydantic import BaseModel
from torchtune.data._messages import InputOutputToMessages, ShareGPTToMessages
from torchtune.models.llama3 import llama3_tokenizer
@@ -21,7 +22,6 @@ from torchtune.models.llama3_1 import lora_llama3_1_8b
from torchtune.models.llama3_2 import lora_llama3_2_3b
from torchtune.modules.transforms import Transform
from llama_stack.apis.post_training import DatasetFormat
from llama_stack.models.llama.sku_list import resolve_model
from llama_stack.models.llama.sku_types import Model

@@ -6,11 +6,11 @@
from enum import Enum
from typing import Any
from llama_stack.apis.datasetio import DatasetIO
from llama_stack.apis.datasets import Datasets
from llama_stack.apis.post_training import (
from llama_stack_api import (
AlgorithmConfig,
Checkpoint,
DatasetIO,
Datasets,
DPOAlignmentConfig,
JobStatus,
ListPostTrainingJobsResponse,
@@ -20,6 +20,7 @@ from llama_stack.apis.post_training import (
PostTrainingJobStatusResponse,
TrainingConfig,
)
from llama_stack.providers.inline.post_training.torchtune.config import (
TorchtunePostTrainingConfig,
)

@@ -12,6 +12,17 @@ from pathlib import Path
from typing import Any
import torch
from llama_stack_api import (
Checkpoint,
DataConfig,
DatasetIO,
Datasets,
LoraFinetuningConfig,
OptimizerConfig,
PostTrainingMetric,
QATFinetuningConfig,
TrainingConfig,
)
from torch import nn
from torch.optim import Optimizer
from torch.utils.data import DataLoader, DistributedSampler
@@ -32,17 +43,6 @@ from torchtune.training.lr_schedulers import get_cosine_schedule_with_warmup
from torchtune.training.metric_logging import DiskLogger
from tqdm import tqdm
from llama_stack.apis.common.training_types import PostTrainingMetric
from llama_stack.apis.datasetio import DatasetIO
from llama_stack.apis.datasets import Datasets
from llama_stack.apis.post_training import (
Checkpoint,
DataConfig,
LoraFinetuningConfig,
OptimizerConfig,
QATFinetuningConfig,
TrainingConfig,
)
from llama_stack.core.utils.config_dirs import DEFAULT_CHECKPOINT_DIR
from llama_stack.core.utils.model_utils import model_local_dir
from llama_stack.log import get_logger

@@ -10,15 +10,17 @@ from typing import TYPE_CHECKING, Any
if TYPE_CHECKING:
from codeshield.cs import CodeShieldScanResult
from llama_stack.apis.inference import OpenAIMessageParam
from llama_stack.apis.safety import (
from llama_stack_api import (
ModerationObject,
ModerationObjectResults,
OpenAIMessageParam,
RunShieldResponse,
Safety,
SafetyViolation,
Shield,
ViolationLevel,
)
from llama_stack.apis.safety.safety import ModerationObject, ModerationObjectResults
from llama_stack.apis.shields import Shield
from llama_stack.log import get_logger
from llama_stack.providers.utils.inference.prompt_adapter import (
interleaved_content_as_str,

@@ -9,26 +9,27 @@ import uuid
from string import Template
from typing import Any
from llama_stack.apis.common.content_types import ImageContentItem, TextContentItem
from llama_stack.apis.inference import (
from llama_stack_api import (
ImageContentItem,
Inference,
ModerationObject,
ModerationObjectResults,
OpenAIChatCompletionRequestWithExtraBody,
OpenAIMessageParam,
OpenAIUserMessageParam,
)
from llama_stack.apis.safety import (
RunShieldResponse,
Safety,
SafetyViolation,
Shield,
ShieldsProtocolPrivate,
TextContentItem,
ViolationLevel,
)
from llama_stack.apis.safety.safety import ModerationObject, ModerationObjectResults
from llama_stack.apis.shields import Shield
from llama_stack.core.datatypes import Api
from llama_stack.log import get_logger
from llama_stack.models.llama.datatypes import Role
from llama_stack.models.llama.sku_types import CoreModelId
from llama_stack.providers.datatypes import ShieldsProtocolPrivate
from llama_stack.providers.utils.inference.prompt_adapter import (
interleaved_content_as_str,
)

@@ -7,21 +7,21 @@
from typing import Any
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from llama_stack.apis.inference import OpenAIMessageParam
from llama_stack.apis.safety import (
from llama_stack_api import (
ModerationObject,
OpenAIMessageParam,
RunShieldResponse,
Safety,
SafetyViolation,
Shield,
ShieldsProtocolPrivate,
ShieldStore,
ViolationLevel,
)
from llama_stack.apis.safety.safety import ModerationObject
from llama_stack.apis.shields import Shield
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from llama_stack.core.utils.model_utils import model_local_dir
from llama_stack.log import get_logger
from llama_stack.providers.datatypes import ShieldsProtocolPrivate
from llama_stack.providers.utils.inference.prompt_adapter import interleaved_content_as_str
from .config import PromptGuardConfig, PromptGuardType

@@ -5,17 +5,19 @@
# the root directory of this source tree.
from typing import Any
from llama_stack.apis.datasetio import DatasetIO
from llama_stack.apis.datasets import Datasets
from llama_stack.apis.scoring import (
from llama_stack_api import (
DatasetIO,
Datasets,
ScoreBatchResponse,
ScoreResponse,
Scoring,
ScoringFn,
ScoringFnParams,
ScoringFunctionsProtocolPrivate,
ScoringResult,
)
from llama_stack.apis.scoring_functions import ScoringFn, ScoringFnParams
from llama_stack.core.datatypes import Api
from llama_stack.providers.datatypes import ScoringFunctionsProtocolPrivate
from llama_stack.providers.utils.common.data_schema_validator import (
get_valid_schemas,
validate_dataset_schema,

@@ -8,8 +8,8 @@ import json
import re
from typing import Any
from llama_stack.apis.scoring import ScoringResultRow
from llama_stack.apis.scoring_functions import ScoringFnParams
from llama_stack_api import ScoringFnParams, ScoringResultRow
from llama_stack.providers.utils.scoring.base_scoring_fn import RegisteredBaseScoringFn
from .fn_defs.docvqa import docvqa

@@ -6,8 +6,8 @@
from typing import Any
from llama_stack.apis.scoring import ScoringResultRow
from llama_stack.apis.scoring_functions import ScoringFnParams
from llama_stack_api import ScoringFnParams, ScoringResultRow
from llama_stack.providers.utils.scoring.base_scoring_fn import RegisteredBaseScoringFn
from .fn_defs.equality import equality

@@ -4,10 +4,10 @@
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from llama_stack.apis.common.type_system import NumberType
from llama_stack.apis.scoring_functions import (
from llama_stack_api import (
AggregationFunctionType,
BasicScoringFnParams,
NumberType,
ScoringFn,
)

@@ -4,10 +4,10 @@
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from llama_stack.apis.common.type_system import NumberType
from llama_stack.apis.scoring_functions import (
from llama_stack_api import (
AggregationFunctionType,
BasicScoringFnParams,
NumberType,
ScoringFn,
)

@@ -4,10 +4,10 @@
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from llama_stack.apis.common.type_system import NumberType
from llama_stack.apis.scoring_functions import (
from llama_stack_api import (
AggregationFunctionType,
BasicScoringFnParams,
NumberType,
ScoringFn,
)

@@ -4,9 +4,9 @@
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from llama_stack.apis.common.type_system import NumberType
from llama_stack.apis.scoring_functions import (
from llama_stack_api import (
AggregationFunctionType,
NumberType,
RegexParserScoringFnParams,
ScoringFn,
)

@@ -4,9 +4,9 @@
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from llama_stack.apis.common.type_system import NumberType
from llama_stack.apis.scoring_functions import (
from llama_stack_api import (
AggregationFunctionType,
NumberType,
RegexParserScoringFnParams,
ScoringFn,
)

@@ -4,10 +4,10 @@
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from llama_stack.apis.common.type_system import NumberType
from llama_stack.apis.scoring_functions import (
from llama_stack_api import (
AggregationFunctionType,
BasicScoringFnParams,
NumberType,
ScoringFn,
)

@@ -6,8 +6,8 @@
from typing import Any
from llama_stack.apis.scoring import ScoringResultRow
from llama_stack.apis.scoring_functions import ScoringFnParams
from llama_stack_api import ScoringFnParams, ScoringResultRow
from llama_stack.providers.utils.scoring.base_scoring_fn import RegisteredBaseScoringFn
from .fn_defs.ifeval import (

@@ -5,8 +5,8 @@
# the root directory of this source tree.
from typing import Any
from llama_stack.apis.scoring import ScoringResultRow
from llama_stack.apis.scoring_functions import ScoringFnParams, ScoringFnParamsType
from llama_stack_api import ScoringFnParams, ScoringFnParamsType, ScoringResultRow
from llama_stack.providers.utils.scoring.base_scoring_fn import RegisteredBaseScoringFn
from ..utils.math_utils import first_answer, normalize_final_answer, try_evaluate_frac, try_evaluate_latex

@@ -6,8 +6,8 @@
import re
from typing import Any
from llama_stack.apis.scoring import ScoringResultRow
from llama_stack.apis.scoring_functions import ScoringFnParams, ScoringFnParamsType
from llama_stack_api import ScoringFnParams, ScoringFnParamsType, ScoringResultRow
from llama_stack.providers.utils.scoring.base_scoring_fn import RegisteredBaseScoringFn
from .fn_defs.regex_parser_multiple_choice_answer import (

@@ -6,8 +6,8 @@
from typing import Any
from llama_stack.apis.scoring import ScoringResultRow
from llama_stack.apis.scoring_functions import ScoringFnParams
from llama_stack_api import ScoringFnParams, ScoringResultRow
from llama_stack.providers.utils.scoring.base_scoring_fn import RegisteredBaseScoringFn
from .fn_defs.subset_of import subset_of

@@ -17,21 +17,22 @@ from autoevals.ragas import (
ContextRelevancy,
Faithfulness,
)
from pydantic import BaseModel
from llama_stack.apis.datasetio import DatasetIO
from llama_stack.apis.datasets import Datasets
from llama_stack.apis.scoring import (
from llama_stack_api import (
DatasetIO,
Datasets,
ScoreBatchResponse,
ScoreResponse,
Scoring,
ScoringFn,
ScoringFnParams,
ScoringFunctionsProtocolPrivate,
ScoringResult,
ScoringResultRow,
)
from llama_stack.apis.scoring_functions import ScoringFn, ScoringFnParams
from pydantic import BaseModel
from llama_stack.core.datatypes import Api
from llama_stack.core.request_headers import NeedsRequestProviderData
from llama_stack.providers.datatypes import ScoringFunctionsProtocolPrivate
from llama_stack.providers.utils.common.data_schema_validator import (
get_valid_schemas,
validate_dataset_schema,

@@ -4,10 +4,10 @@
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from llama_stack.apis.common.type_system import NumberType
from llama_stack.apis.scoring_functions import (
from llama_stack_api import (
AggregationFunctionType,
BasicScoringFnParams,
NumberType,
ScoringFn,
)

@@ -4,10 +4,10 @@
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from llama_stack.apis.common.type_system import NumberType
from llama_stack.apis.scoring_functions import (
from llama_stack_api import (
AggregationFunctionType,
BasicScoringFnParams,
NumberType,
ScoringFn,
)

@@ -4,10 +4,10 @@
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from llama_stack.apis.common.type_system import NumberType
from llama_stack.apis.scoring_functions import (
from llama_stack_api import (
AggregationFunctionType,
BasicScoringFnParams,
NumberType,
ScoringFn,
)

@@ -4,10 +4,10 @@
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from llama_stack.apis.common.type_system import NumberType
from llama_stack.apis.scoring_functions import (
from llama_stack_api import (
AggregationFunctionType,
BasicScoringFnParams,
NumberType,
ScoringFn,
)

@@ -4,10 +4,10 @@
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from llama_stack.apis.common.type_system import NumberType
from llama_stack.apis.scoring_functions import (
from llama_stack_api import (
AggregationFunctionType,
BasicScoringFnParams,
NumberType,
ScoringFn,
)

@@ -4,10 +4,10 @@
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from llama_stack.apis.common.type_system import NumberType
from llama_stack.apis.scoring_functions import (
from llama_stack_api import (
AggregationFunctionType,
BasicScoringFnParams,
NumberType,
ScoringFn,
)

@@ -4,10 +4,10 @@
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from llama_stack.apis.common.type_system import NumberType
from llama_stack.apis.scoring_functions import (
from llama_stack_api import (
AggregationFunctionType,
BasicScoringFnParams,
NumberType,
ScoringFn,
)

@@ -4,10 +4,10 @@
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from llama_stack.apis.common.type_system import NumberType
from llama_stack.apis.scoring_functions import (
from llama_stack_api import (
AggregationFunctionType,
BasicScoringFnParams,
NumberType,
ScoringFn,
)

@@ -4,10 +4,10 @@
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from llama_stack.apis.common.type_system import NumberType
from llama_stack.apis.scoring_functions import (
from llama_stack_api import (
AggregationFunctionType,
BasicScoringFnParams,
NumberType,
ScoringFn,
)

@@ -5,18 +5,20 @@
# the root directory of this source tree.
from typing import Any
from llama_stack.apis.datasetio import DatasetIO
from llama_stack.apis.datasets import Datasets
from llama_stack.apis.inference import Inference
from llama_stack.apis.scoring import (
from llama_stack_api import (
DatasetIO,
Datasets,
Inference,
ScoreBatchResponse,
ScoreResponse,
Scoring,
ScoringFn,
ScoringFnParams,
ScoringFunctionsProtocolPrivate,
ScoringResult,
)
from llama_stack.apis.scoring_functions import ScoringFn, ScoringFnParams
from llama_stack.core.datatypes import Api
from llama_stack.providers.datatypes import ScoringFunctionsProtocolPrivate
from llama_stack.providers.utils.common.data_schema_validator import (
get_valid_schemas,
validate_dataset_schema,

@@ -4,10 +4,10 @@
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from llama_stack.apis.common.type_system import NumberType
from llama_stack.apis.scoring_functions import (
from llama_stack_api import (
AggregationFunctionType,
LLMAsJudgeScoringFnParams,
NumberType,
ScoringFn,
)

@@ -4,8 +4,7 @@
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from llama_stack.apis.common.type_system import NumberType
from llama_stack.apis.scoring_functions import LLMAsJudgeScoringFnParams, ScoringFn
from llama_stack_api import LLMAsJudgeScoringFnParams, NumberType, ScoringFn
llm_as_judge_base = ScoringFn(
identifier="llm-as-judge::base",

@@ -6,9 +6,8 @@
import re
from typing import Any
from llama_stack.apis.inference import Inference, OpenAIChatCompletionRequestWithExtraBody
from llama_stack.apis.scoring import ScoringResultRow
from llama_stack.apis.scoring_functions import ScoringFnParams
from llama_stack_api import Inference, OpenAIChatCompletionRequestWithExtraBody, ScoringFnParams, ScoringResultRow
from llama_stack.providers.utils.scoring.base_scoring_fn import RegisteredBaseScoringFn
from .fn_defs.llm_as_judge_405b_simpleqa import llm_as_judge_405b_simpleqa

@@ -6,7 +6,7 @@
from typing import Any
from llama_stack.providers.datatypes import Api
from llama_stack_api import Api
from .config import RagToolRuntimeConfig

@@ -6,15 +6,16 @@
from jinja2 import Template
from llama_stack.apis.common.content_types import InterleavedContent
from llama_stack.apis.inference import OpenAIChatCompletionRequestWithExtraBody, OpenAIUserMessageParam
from llama_stack.apis.tools.rag_tool import (
from llama_stack_api import (
DefaultRAGQueryGeneratorConfig,
InterleavedContent,
LLMRAGQueryGeneratorConfig,
OpenAIChatCompletionRequestWithExtraBody,
OpenAIUserMessageParam,
RAGQueryGenerator,
RAGQueryGeneratorConfig,
)
from llama_stack.providers.utils.inference.prompt_adapter import (
interleaved_content_as_str,
)

@@ -12,34 +12,31 @@ from typing import Any
import httpx
from fastapi import UploadFile
from pydantic import TypeAdapter
from llama_stack.apis.common.content_types import (
from llama_stack_api import (
URL,
Files,
Inference,
InterleavedContent,
InterleavedContentItem,
TextContentItem,
)
from llama_stack.apis.files import Files, OpenAIFilePurpose
from llama_stack.apis.inference import Inference
from llama_stack.apis.tools import (
ListToolDefsResponse,
OpenAIFilePurpose,
QueryChunksResponse,
RAGDocument,
RAGQueryConfig,
RAGQueryResult,
TextContentItem,
ToolDef,
ToolGroup,
ToolGroupsProtocolPrivate,
ToolInvocationResult,
ToolRuntime,
)
from llama_stack.apis.vector_io import (
QueryChunksResponse,
VectorIO,
VectorStoreChunkingStrategyStatic,
VectorStoreChunkingStrategyStaticConfig,
)
from pydantic import TypeAdapter
from llama_stack.log import get_logger
from llama_stack.providers.datatypes import ToolGroupsProtocolPrivate
from llama_stack.providers.utils.inference.prompt_adapter import interleaved_content_as_str
from llama_stack.providers.utils.memory.vector_store import parse_data_url

@@ -6,7 +6,7 @@
from typing import Any
from llama_stack.providers.datatypes import Api
from llama_stack_api import Api
from .config import ChromaVectorIOConfig

@@ -6,10 +6,10 @@
from typing import Any
from llama_stack_api import json_schema_type
from pydantic import BaseModel, Field
from llama_stack.core.storage.datatypes import KVStoreReference
from llama_stack.schema_utils import json_schema_type
@json_schema_type

@@ -6,7 +6,7 @@
from typing import Any
from llama_stack.providers.datatypes import Api
from llama_stack_api import Api
from .config import FaissVectorIOConfig

@@ -6,10 +6,10 @@
from typing import Any
from llama_stack_api import json_schema_type
from pydantic import BaseModel
from llama_stack.core.storage.datatypes import KVStoreReference
from llama_stack.schema_utils import json_schema_type
@json_schema_type

@@ -12,15 +12,22 @@ from typing import Any
import faiss # type: ignore[import-untyped]
import numpy as np
from llama_stack_api import (
Chunk,
Files,
HealthResponse,
HealthStatus,
Inference,
InterleavedContent,
QueryChunksResponse,
VectorIO,
VectorStore,
VectorStoreNotFoundError,
VectorStoresProtocolPrivate,
)
from numpy.typing import NDArray
from llama_stack.apis.common.errors import VectorStoreNotFoundError
from llama_stack.apis.files import Files
from llama_stack.apis.inference import Inference, InterleavedContent
from llama_stack.apis.vector_io import Chunk, QueryChunksResponse, VectorIO
from llama_stack.apis.vector_stores import VectorStore
from llama_stack.log import get_logger
from llama_stack.providers.datatypes import HealthResponse, HealthStatus, VectorStoresProtocolPrivate
from llama_stack.providers.utils.kvstore import kvstore_impl
from llama_stack.providers.utils.kvstore.api import KVStore
from llama_stack.providers.utils.memory.openai_vector_store_mixin import OpenAIVectorStoreMixin

@@ -6,7 +6,7 @@
from typing import Any
from llama_stack.providers.datatypes import Api
from llama_stack_api import Api
from .config import MilvusVectorIOConfig

@@ -6,10 +6,10 @@
from typing import Any
from llama_stack_api import json_schema_type
from pydantic import BaseModel, Field
from llama_stack.core.storage.datatypes import KVStoreReference
from llama_stack.schema_utils import json_schema_type
@json_schema_type

@@ -6,7 +6,7 @@
from typing import Any
from llama_stack.providers.datatypes import Api
from llama_stack_api import Api
from .config import QdrantVectorIOConfig

@@ -7,10 +7,10 @@
from typing import Any
from llama_stack_api import json_schema_type
from pydantic import BaseModel
from llama_stack.core.storage.datatypes import KVStoreReference
from llama_stack.schema_utils import json_schema_type
@json_schema_type

@@ -6,7 +6,7 @@
from typing import Any
from llama_stack.providers.datatypes import Api
from llama_stack_api import Api
from .config import SQLiteVectorIOConfig

@@ -12,15 +12,19 @@ from typing import Any
import numpy as np
import sqlite_vec # type: ignore[import-untyped]
from llama_stack_api import (
Chunk,
Files,
Inference,
QueryChunksResponse,
VectorIO,
VectorStore,
VectorStoreNotFoundError,
VectorStoresProtocolPrivate,
)
from numpy.typing import NDArray
from llama_stack.apis.common.errors import VectorStoreNotFoundError
from llama_stack.apis.files import Files
from llama_stack.apis.inference import Inference
from llama_stack.apis.vector_io import Chunk, QueryChunksResponse, VectorIO
from llama_stack.apis.vector_stores import VectorStore
from llama_stack.log import get_logger
from llama_stack.providers.datatypes import VectorStoresProtocolPrivate
from llama_stack.providers.utils.kvstore import kvstore_impl
from llama_stack.providers.utils.kvstore.api import KVStore
from llama_stack.providers.utils.memory.openai_vector_store_mixin import OpenAIVectorStoreMixin

@@ -5,11 +5,12 @@
# the root directory of this source tree.
from llama_stack.providers.datatypes import (
from llama_stack_api import (
Api,
InlineProviderSpec,
ProviderSpec,
)
from llama_stack.providers.utils.kvstore import kvstore_dependencies

@@ -5,7 +5,7 @@
# the root directory of this source tree.
from llama_stack.providers.datatypes import Api, InlineProviderSpec, ProviderSpec
from llama_stack_api import Api, InlineProviderSpec, ProviderSpec
def available_providers() -> list[ProviderSpec]:

@@ -5,7 +5,7 @@
# the root directory of this source tree.
from llama_stack.providers.datatypes import (
from llama_stack_api import (
Api,
InlineProviderSpec,
ProviderSpec,

@@ -5,7 +5,7 @@
# the root directory of this source tree.
from llama_stack.providers.datatypes import Api, InlineProviderSpec, ProviderSpec, RemoteProviderSpec
from llama_stack_api import Api, InlineProviderSpec, ProviderSpec, RemoteProviderSpec
def available_providers() -> list[ProviderSpec]:

@@ -4,7 +4,8 @@
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from llama_stack.providers.datatypes import Api, InlineProviderSpec, ProviderSpec, RemoteProviderSpec
from llama_stack_api import Api, InlineProviderSpec, ProviderSpec, RemoteProviderSpec
from llama_stack.providers.utils.sqlstore.sqlstore import sql_store_pip_packages

@@ -5,7 +5,7 @@
# the root directory of this source tree.
from llama_stack.providers.datatypes import (
from llama_stack_api import (
Api,
InlineProviderSpec,
ProviderSpec,

@@ -7,7 +7,7 @@
from typing import cast
from llama_stack.providers.datatypes import Api, InlineProviderSpec, ProviderSpec, RemoteProviderSpec
from llama_stack_api import Api, InlineProviderSpec, ProviderSpec, RemoteProviderSpec
# We provide two versions of these providers so that distributions can package the appropriate version of torch.
# The CPU version is used for distributions that don't have GPU support -- they result in smaller container images.

@@ -5,7 +5,7 @@
# the root directory of this source tree.
from llama_stack.providers.datatypes import (
from llama_stack_api import (
Api,
InlineProviderSpec,
ProviderSpec,

@@ -5,7 +5,7 @@
# the root directory of this source tree.
from llama_stack.providers.datatypes import Api, InlineProviderSpec, ProviderSpec
from llama_stack_api import Api, InlineProviderSpec, ProviderSpec
def available_providers() -> list[ProviderSpec]:

@@ -5,12 +5,13 @@
# the root directory of this source tree.
from llama_stack.providers.datatypes import (
from llama_stack_api import (
Api,
InlineProviderSpec,
ProviderSpec,
RemoteProviderSpec,
)
from llama_stack.providers.registry.vector_io import DEFAULT_VECTOR_IO_DEPS

@@ -5,7 +5,7 @@
# the root directory of this source tree.
from llama_stack.providers.datatypes import (
from llama_stack_api import (
Api,
InlineProviderSpec,
ProviderSpec,
@@ -244,7 +244,7 @@ Two ranker types are supported:
Example using RAGQueryConfig with different search modes:
```python
from llama_stack.apis.tools import RAGQueryConfig, RRFRanker, WeightedRanker
from llama_stack_api import RAGQueryConfig, RRFRanker, WeightedRanker
# Vector search
config = RAGQueryConfig(mode="vector", max_chunks=5)

@@ -6,10 +6,8 @@
from typing import Any
from urllib.parse import parse_qs, urlparse
from llama_stack.apis.common.responses import PaginatedResponse
from llama_stack.apis.datasetio import DatasetIO
from llama_stack.apis.datasets import Dataset
from llama_stack.providers.datatypes import DatasetsProtocolPrivate
from llama_stack_api import Dataset, DatasetIO, DatasetsProtocolPrivate, PaginatedResponse
from llama_stack.providers.utils.kvstore import kvstore_impl
from llama_stack.providers.utils.pagination import paginate_records

@@ -7,11 +7,7 @@
from typing import Any
import aiohttp
from llama_stack.apis.common.content_types import URL
from llama_stack.apis.common.responses import PaginatedResponse
from llama_stack.apis.common.type_system import ParamType
from llama_stack.apis.datasets import Dataset
from llama_stack_api import URL, Dataset, PaginatedResponse, ParamType
from .config import NvidiaDatasetIOConfig

@@ -6,18 +6,24 @@
from typing import Any
import requests
from llama_stack_api import (
Agents,
Benchmark,
BenchmarkConfig,
BenchmarksProtocolPrivate,
DatasetIO,
Datasets,
Eval,
EvaluateResponse,
Inference,
Job,
JobStatus,
Scoring,
ScoringResult,
)
from llama_stack.apis.agents import Agents
from llama_stack.apis.benchmarks import Benchmark
from llama_stack.apis.datasetio import DatasetIO
from llama_stack.apis.datasets import Datasets
from llama_stack.apis.inference import Inference
from llama_stack.apis.scoring import Scoring, ScoringResult
from llama_stack.providers.datatypes import BenchmarksProtocolPrivate
from llama_stack.providers.utils.inference.model_registry import ModelRegistryHelper
from .....apis.common.job_types import Job, JobStatus
from .....apis.eval.eval import BenchmarkConfig, Eval, EvaluateResponse
from .config import NVIDIAEvalConfig
DEFAULT_NAMESPACE = "nvidia"

@@ -8,17 +8,17 @@ from datetime import UTC, datetime
from typing import Annotated, Any
from fastapi import Depends, File, Form, Response, UploadFile
from llama_stack.apis.common.errors import ResourceNotFoundError
from llama_stack.apis.common.responses import Order
from llama_stack.apis.files import (
from llama_stack_api import (
ExpiresAfter,
Files,
ListOpenAIFileResponse,
OpenAIFileDeleteResponse,
OpenAIFileObject,
OpenAIFilePurpose,
Order,
ResourceNotFoundError,
)
from llama_stack.core.datatypes import AccessRule
from llama_stack.providers.utils.files.form_data import parse_expires_after
from llama_stack.providers.utils.sqlstore.api import ColumnDefinition, ColumnType

@@ -17,16 +17,17 @@ from fastapi import Depends, File, Form, Response, UploadFile
if TYPE_CHECKING:
from mypy_boto3_s3.client import S3Client
from llama_stack.apis.common.errors import ResourceNotFoundError
from llama_stack.apis.common.responses import Order
from llama_stack.apis.files import (
from llama_stack_api import (
ExpiresAfter,
Files,
ListOpenAIFileResponse,
OpenAIFileDeleteResponse,
OpenAIFileObject,
OpenAIFilePurpose,
Order,
ResourceNotFoundError,
)
from llama_stack.core.datatypes import AccessRule
from llama_stack.core.id_generation import generate_object_id
from llama_stack.providers.utils.files.form_data import parse_expires_after

@@ -6,10 +6,10 @@
from typing import Any
from llama_stack_api import json_schema_type
from pydantic import BaseModel, Field
from llama_stack.providers.utils.inference.model_registry import RemoteInferenceProviderConfig
from llama_stack.schema_utils import json_schema_type
class AnthropicProviderDataValidator(BaseModel):

@@ -7,10 +7,10 @@
import os
from typing import Any
from llama_stack_api import json_schema_type
from pydantic import BaseModel, Field, HttpUrl, SecretStr
from llama_stack.providers.utils.inference.model_registry import RemoteInferenceProviderConfig
from llama_stack.schema_utils import json_schema_type
class AzureProviderDataValidator(BaseModel):

@@ -6,9 +6,7 @@
from collections.abc import AsyncIterator, Iterable
from openai import AuthenticationError
from llama_stack.apis.inference import (
from llama_stack_api import (
OpenAIChatCompletion,
OpenAIChatCompletionChunk,
OpenAIChatCompletionRequestWithExtraBody,
@@ -17,6 +15,8 @@ from llama_stack.apis.inference import (
OpenAIEmbeddingsRequestWithExtraBody,
OpenAIEmbeddingsResponse,
)
from openai import AuthenticationError
from llama_stack.core.telemetry.tracing import get_current_span
from llama_stack.log import get_logger
from llama_stack.providers.utils.inference.openai_mixin import OpenAIMixin

@@ -6,10 +6,11 @@
from urllib.parse import urljoin
from llama_stack.apis.inference import (
from llama_stack_api import (
OpenAIEmbeddingsRequestWithExtraBody,
OpenAIEmbeddingsResponse,
)
from llama_stack.providers.utils.inference.openai_mixin import OpenAIMixin
from .config import CerebrasImplConfig

@@ -7,10 +7,10 @@
import os
from typing import Any
from llama_stack_api import json_schema_type
from pydantic import BaseModel, Field
from llama_stack.providers.utils.inference.model_registry import RemoteInferenceProviderConfig
from llama_stack.schema_utils import json_schema_type
DEFAULT_BASE_URL = "https://api.cerebras.ai"

@@ -6,10 +6,10 @@
from typing import Any
from llama_stack_api import json_schema_type
from pydantic import BaseModel, Field, SecretStr
from llama_stack.providers.utils.inference.model_registry import RemoteInferenceProviderConfig
from llama_stack.schema_utils import json_schema_type
class DatabricksProviderDataValidator(BaseModel):

@@ -7,8 +7,8 @@
from collections.abc import Iterable
from databricks.sdk import WorkspaceClient
from llama_stack_api import OpenAICompletion, OpenAICompletionRequestWithExtraBody
from llama_stack.apis.inference import OpenAICompletion, OpenAICompletionRequestWithExtraBody
from llama_stack.log import get_logger
from llama_stack.providers.utils.inference.openai_mixin import OpenAIMixin

@@ -6,10 +6,10 @@
from typing import Any
from llama_stack_api import json_schema_type
from pydantic import Field
from llama_stack.providers.utils.inference.model_registry import RemoteInferenceProviderConfig
from llama_stack.schema_utils import json_schema_type
@json_schema_type

@@ -6,10 +6,10 @@
from typing import Any
from llama_stack_api import json_schema_type
from pydantic import BaseModel, Field
from llama_stack.providers.utils.inference.model_registry import RemoteInferenceProviderConfig
from llama_stack.schema_utils import json_schema_type
class GeminiProviderDataValidator(BaseModel):

@@ -6,12 +6,13 @@
from typing import Any
from llama_stack.apis.inference import (
from llama_stack_api import (
OpenAIEmbeddingData,
OpenAIEmbeddingsRequestWithExtraBody,
OpenAIEmbeddingsResponse,
OpenAIEmbeddingUsage,
)
from llama_stack.providers.utils.inference.openai_mixin import OpenAIMixin
from .config import GeminiConfig

@@ -6,10 +6,10 @@
from typing import Any
from llama_stack_api import json_schema_type
from pydantic import BaseModel, Field
from llama_stack.providers.utils.inference.model_registry import RemoteInferenceProviderConfig
from llama_stack.schema_utils import json_schema_type
class GroqProviderDataValidator(BaseModel):

@@ -6,10 +6,10 @@
from typing import Any
from llama_stack_api import json_schema_type
from pydantic import BaseModel, Field
from llama_stack.providers.utils.inference.model_registry import RemoteInferenceProviderConfig
from llama_stack.schema_utils import json_schema_type
class LlamaProviderDataValidator(BaseModel):

@@ -4,12 +4,13 @@
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from llama_stack.apis.inference.inference import (
from llama_stack_api import (
OpenAICompletion,
OpenAICompletionRequestWithExtraBody,
OpenAIEmbeddingsRequestWithExtraBody,
OpenAIEmbeddingsResponse,
)
from llama_stack.log import get_logger
from llama_stack.providers.remote.inference.llama_openai_compat.config import LlamaCompatConfig
from llama_stack.providers.utils.inference.openai_mixin import OpenAIMixin

@@ -4,7 +4,7 @@
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from llama_stack.apis.inference import Inference
from llama_stack_api import Inference
from .config import NVIDIAConfig

@@ -7,10 +7,10 @@
import os
from typing import Any
from llama_stack_api import json_schema_type
from pydantic import BaseModel, Field
from llama_stack.providers.utils.inference.model_registry import RemoteInferenceProviderConfig
from llama_stack.schema_utils import json_schema_type
class NVIDIAProviderDataValidator(BaseModel):

Some files were not shown because too many files have changed in this diff.