Merge remote-tracking branch 'origin/main' into TamiTakamiya/tool-param-definition-update

Ashwin Bharambe 2025-09-27 10:47:08 -07:00
commit c1818350c8
479 changed files with 74743 additions and 8997 deletions

View file

@ -27,6 +27,7 @@ from llama_stack.apis.inference import (
)
from llama_stack.apis.safety import SafetyViolation
from llama_stack.apis.tools import ToolDef
from llama_stack.apis.version import LLAMA_STACK_API_V1
from llama_stack.schema_utils import json_schema_type, register_schema, webmethod
from .openai_responses import (
@ -481,7 +482,7 @@ class Agents(Protocol):
- Agents can also use Memory to retrieve information from knowledge bases. See the RAG Tool and Vector IO APIs for more details.
"""
@webmethod(route="/agents", method="POST", descriptive_name="create_agent")
@webmethod(route="/agents", method="POST", descriptive_name="create_agent", level=LLAMA_STACK_API_V1)
async def create_agent(
self,
agent_config: AgentConfig,
@ -494,7 +495,10 @@ class Agents(Protocol):
...
@webmethod(
route="/agents/{agent_id}/session/{session_id}/turn", method="POST", descriptive_name="create_agent_turn"
route="/agents/{agent_id}/session/{session_id}/turn",
method="POST",
descriptive_name="create_agent_turn",
level=LLAMA_STACK_API_V1,
)
async def create_agent_turn(
self,
@ -524,6 +528,7 @@ class Agents(Protocol):
route="/agents/{agent_id}/session/{session_id}/turn/{turn_id}/resume",
method="POST",
descriptive_name="resume_agent_turn",
level=LLAMA_STACK_API_V1,
)
async def resume_agent_turn(
self,
@ -549,6 +554,7 @@ class Agents(Protocol):
@webmethod(
route="/agents/{agent_id}/session/{session_id}/turn/{turn_id}",
method="GET",
level=LLAMA_STACK_API_V1,
)
async def get_agents_turn(
self,
@ -568,6 +574,7 @@ class Agents(Protocol):
@webmethod(
route="/agents/{agent_id}/session/{session_id}/turn/{turn_id}/step/{step_id}",
method="GET",
level=LLAMA_STACK_API_V1,
)
async def get_agents_step(
self,
@ -586,7 +593,12 @@ class Agents(Protocol):
"""
...
@webmethod(route="/agents/{agent_id}/session", method="POST", descriptive_name="create_agent_session")
@webmethod(
route="/agents/{agent_id}/session",
method="POST",
descriptive_name="create_agent_session",
level=LLAMA_STACK_API_V1,
)
async def create_agent_session(
self,
agent_id: str,
@ -600,7 +612,7 @@ class Agents(Protocol):
"""
...
@webmethod(route="/agents/{agent_id}/session/{session_id}", method="GET")
@webmethod(route="/agents/{agent_id}/session/{session_id}", method="GET", level=LLAMA_STACK_API_V1)
async def get_agents_session(
self,
session_id: str,
@ -616,7 +628,7 @@ class Agents(Protocol):
"""
...
@webmethod(route="/agents/{agent_id}/session/{session_id}", method="DELETE")
@webmethod(route="/agents/{agent_id}/session/{session_id}", method="DELETE", level=LLAMA_STACK_API_V1)
async def delete_agents_session(
self,
session_id: str,
@ -629,7 +641,7 @@ class Agents(Protocol):
"""
...
@webmethod(route="/agents/{agent_id}", method="DELETE")
@webmethod(route="/agents/{agent_id}", method="DELETE", level=LLAMA_STACK_API_V1)
async def delete_agent(
self,
agent_id: str,
@ -640,7 +652,7 @@ class Agents(Protocol):
"""
...
@webmethod(route="/agents", method="GET")
@webmethod(route="/agents", method="GET", level=LLAMA_STACK_API_V1)
async def list_agents(self, start_index: int | None = None, limit: int | None = None) -> PaginatedResponse:
"""List all agents.
@ -650,7 +662,7 @@ class Agents(Protocol):
"""
...
@webmethod(route="/agents/{agent_id}", method="GET")
@webmethod(route="/agents/{agent_id}", method="GET", level=LLAMA_STACK_API_V1)
async def get_agent(self, agent_id: str) -> Agent:
"""Describe an agent by its ID.
@ -659,7 +671,7 @@ class Agents(Protocol):
"""
...
@webmethod(route="/agents/{agent_id}/sessions", method="GET")
@webmethod(route="/agents/{agent_id}/sessions", method="GET", level=LLAMA_STACK_API_V1)
async def list_agent_sessions(
self,
agent_id: str,
@ -682,7 +694,7 @@ class Agents(Protocol):
#
# Both of these APIs are inherently stateful.
@webmethod(route="/openai/v1/responses/{response_id}", method="GET")
@webmethod(route="/openai/v1/responses/{response_id}", method="GET", level=LLAMA_STACK_API_V1)
async def get_openai_response(
self,
response_id: str,
@ -694,7 +706,7 @@ class Agents(Protocol):
"""
...
@webmethod(route="/openai/v1/responses", method="POST")
@webmethod(route="/openai/v1/responses", method="POST", level=LLAMA_STACK_API_V1)
async def create_openai_response(
self,
input: str | list[OpenAIResponseInput],
@ -719,7 +731,7 @@ class Agents(Protocol):
"""
...
@webmethod(route="/openai/v1/responses", method="GET")
@webmethod(route="/openai/v1/responses", method="GET", level=LLAMA_STACK_API_V1)
async def list_openai_responses(
self,
after: str | None = None,
@ -737,7 +749,7 @@ class Agents(Protocol):
"""
...
@webmethod(route="/openai/v1/responses/{response_id}/input_items", method="GET")
@webmethod(route="/openai/v1/responses/{response_id}/input_items", method="GET", level=LLAMA_STACK_API_V1)
async def list_openai_response_input_items(
self,
response_id: str,
@ -759,7 +771,7 @@ class Agents(Protocol):
"""
...
@webmethod(route="/openai/v1/responses/{response_id}", method="DELETE")
@webmethod(route="/openai/v1/responses/{response_id}", method="DELETE", level=LLAMA_STACK_API_V1)
async def delete_openai_response(self, response_id: str) -> OpenAIDeleteResponseObject:
"""Delete an OpenAI response by its ID.

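The recurring change in this file is the new level= argument on @webmethod. A minimal sketch of how such a decorator could record the version level as metadata on the function, assuming a WebMethod container type (the real llama_stack.schema_utils implementation may differ):

from dataclasses import dataclass
from typing import Callable

@dataclass
class WebMethod:
    route: str | None = None
    method: str | None = None
    level: str | None = None              # e.g. "v1" or "v1alpha"
    descriptive_name: str | None = None
    deprecated: bool = False

def webmethod(route=None, method=None, level=None, descriptive_name=None, deprecated=False) -> Callable:
    def wrapper(func):
        # Attach routing metadata without altering the function itself;
        # a server can later collect it to build versioned routes.
        func.__webmethod__ = WebMethod(route, method, level, descriptive_name, deprecated)
        return func
    return wrapper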
View file

@ -1,7 +0,0 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from .batch_inference import *

View file

@ -1,78 +0,0 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from typing import Protocol, runtime_checkable
from llama_stack.apis.common.job_types import Job
from llama_stack.apis.inference import (
InterleavedContent,
LogProbConfig,
Message,
ResponseFormat,
SamplingParams,
ToolChoice,
ToolDefinition,
ToolPromptFormat,
)
from llama_stack.schema_utils import webmethod
@runtime_checkable
class BatchInference(Protocol):
"""Batch inference API for generating completions and chat completions.
This is an asynchronous API. If the request is successful, the response will be a job which can be polled for completion.
NOTE: This API is not yet implemented and is subject to change in concert with other asynchronous APIs
including (post-training, evals, etc).
"""
@webmethod(route="/batch-inference/completion", method="POST")
async def completion(
self,
model: str,
content_batch: list[InterleavedContent],
sampling_params: SamplingParams | None = None,
response_format: ResponseFormat | None = None,
logprobs: LogProbConfig | None = None,
) -> Job:
"""Generate completions for a batch of content.
:param model: The model to use for the completion.
:param content_batch: The content to complete.
:param sampling_params: The sampling parameters to use for the completion.
:param response_format: The response format to use for the completion.
:param logprobs: The logprobs to use for the completion.
:returns: A job for the completion.
"""
...
@webmethod(route="/batch-inference/chat-completion", method="POST")
async def chat_completion(
self,
model: str,
messages_batch: list[list[Message]],
sampling_params: SamplingParams | None = None,
# zero-shot tool definitions as input to the model
tools: list[ToolDefinition] | None = None,
tool_choice: ToolChoice | None = ToolChoice.auto,
tool_prompt_format: ToolPromptFormat | None = None,
response_format: ResponseFormat | None = None,
logprobs: LogProbConfig | None = None,
) -> Job:
"""Generate chat completions for a batch of messages.
:param model: The model to use for the chat completion.
:param messages_batch: The messages to complete.
:param sampling_params: The sampling parameters to use for the completion.
:param tools: The tools to use for the chat completion.
:param tool_choice: The tool choice to use for the chat completion.
:param tool_prompt_format: The tool prompt format to use for the chat completion.
:param response_format: The response format to use for the chat completion.
:param logprobs: The logprobs to use for the chat completion.
:returns: A job for the chat completion.
"""
...

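The deleted protocol's docstring says a successful request returns a job which can be polled for completion. A hypothetical polling loop for such a flow (the client object, its get_job method, and the status strings are assumptions, not part of the removed API):

import asyncio

async def wait_for_job(client, job_id: str, interval: float = 5.0):
    # Poll until the batch job reaches a terminal state; get_job and the
    # status values are hypothetical stand-ins for a concrete client.
    while True:
        job = await client.get_job(job_id)
        if job.status in ("completed", "failed", "cancelled"):
            return job
        await asyncio.sleep(interval)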
View file

@ -8,6 +8,7 @@ from typing import Literal, Protocol, runtime_checkable
from pydantic import BaseModel, Field
from llama_stack.apis.version import LLAMA_STACK_API_V1
from llama_stack.schema_utils import json_schema_type, webmethod
try:
@ -42,7 +43,7 @@ class Batches(Protocol):
Note: This API is currently under active development and may undergo changes.
"""
@webmethod(route="/openai/v1/batches", method="POST")
@webmethod(route="/openai/v1/batches", method="POST", level=LLAMA_STACK_API_V1)
async def create_batch(
self,
input_file_id: str,
@ -62,7 +63,7 @@ class Batches(Protocol):
"""
...
@webmethod(route="/openai/v1/batches/{batch_id}", method="GET")
@webmethod(route="/openai/v1/batches/{batch_id}", method="GET", level=LLAMA_STACK_API_V1)
async def retrieve_batch(self, batch_id: str) -> BatchObject:
"""Retrieve information about a specific batch.
@ -71,7 +72,7 @@ class Batches(Protocol):
"""
...
@webmethod(route="/openai/v1/batches/{batch_id}/cancel", method="POST")
@webmethod(route="/openai/v1/batches/{batch_id}/cancel", method="POST", level=LLAMA_STACK_API_V1)
async def cancel_batch(self, batch_id: str) -> BatchObject:
"""Cancel a batch that is in progress.
@ -80,7 +81,7 @@ class Batches(Protocol):
"""
...
@webmethod(route="/openai/v1/batches", method="GET")
@webmethod(route="/openai/v1/batches", method="GET", level=LLAMA_STACK_API_V1)
async def list_batches(
self,
after: str | None = None,

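Because these routes mirror the OpenAI Batch API under the /openai/v1 prefix, the stock openai client should be able to drive them directly. A sketch under that assumption (base URL, port, and the file id are placeholders, not values from this diff):

from openai import OpenAI

client = OpenAI(base_url="http://localhost:8321/openai/v1", api_key="none")

batch = client.batches.create(
    input_file_id="file-abc123",        # hypothetical uploaded file id
    endpoint="/v1/chat/completions",
    completion_window="24h",
)
# Maps to GET /openai/v1/batches/{batch_id}
print(client.batches.retrieve(batch.id).status)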
View file

@ -8,6 +8,7 @@ from typing import Any, Literal, Protocol, runtime_checkable
from pydantic import BaseModel, Field
from llama_stack.apis.resource import Resource, ResourceType
from llama_stack.apis.version import LLAMA_STACK_API_V1, LLAMA_STACK_API_V1ALPHA
from llama_stack.schema_utils import json_schema_type, webmethod
@ -53,7 +54,8 @@ class ListBenchmarksResponse(BaseModel):
@runtime_checkable
class Benchmarks(Protocol):
@webmethod(route="/eval/benchmarks", method="GET")
@webmethod(route="/eval/benchmarks", method="GET", level=LLAMA_STACK_API_V1, deprecated=True)
@webmethod(route="/eval/benchmarks", method="GET", level=LLAMA_STACK_API_V1ALPHA)
async def list_benchmarks(self) -> ListBenchmarksResponse:
"""List all benchmarks.
@ -61,7 +63,8 @@ class Benchmarks(Protocol):
"""
...
@webmethod(route="/eval/benchmarks/{benchmark_id}", method="GET")
@webmethod(route="/eval/benchmarks/{benchmark_id}", method="GET", level=LLAMA_STACK_API_V1, deprecated=True)
@webmethod(route="/eval/benchmarks/{benchmark_id}", method="GET", level=LLAMA_STACK_API_V1ALPHA)
async def get_benchmark(
self,
benchmark_id: str,
@ -73,7 +76,8 @@ class Benchmarks(Protocol):
"""
...
@webmethod(route="/eval/benchmarks", method="POST")
@webmethod(route="/eval/benchmarks", method="POST", level=LLAMA_STACK_API_V1, deprecated=True)
@webmethod(route="/eval/benchmarks", method="POST", level=LLAMA_STACK_API_V1ALPHA)
async def register_benchmark(
self,
benchmark_id: str,
@ -94,7 +98,8 @@ class Benchmarks(Protocol):
"""
...
@webmethod(route="/eval/benchmarks/{benchmark_id}", method="DELETE")
@webmethod(route="/eval/benchmarks/{benchmark_id}", method="DELETE", level=LLAMA_STACK_API_V1, deprecated=True)
@webmethod(route="/eval/benchmarks/{benchmark_id}", method="DELETE", level=LLAMA_STACK_API_V1ALPHA)
async def unregister_benchmark(self, benchmark_id: str) -> None:
"""Unregister a benchmark.

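Each benchmark method now carries two registrations: the old /v1 route flagged deprecated=True plus a fresh v1alpha route. For stacking to work, the decorator has to accumulate entries rather than overwrite a single attribute; a sketch of that pattern (the __webmethods__ attribute name is an assumption):

def webmethod(route, method=None, level="v1", deprecated=False, **extra):
    def wrapper(func):
        # Append one record per decorator application, so a single method
        # can be served at /v1/... (deprecated) and /v1alpha/... at once.
        regs = list(getattr(func, "__webmethods__", []))
        regs.append({"route": route, "method": method, "level": level,
                     "deprecated": deprecated, **extra})
        func.__webmethods__ = regs
        return func
    return wrapper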
View file

@ -8,6 +8,7 @@ from typing import Any, Protocol, runtime_checkable
from llama_stack.apis.common.responses import PaginatedResponse
from llama_stack.apis.datasets import Dataset
from llama_stack.apis.version import LLAMA_STACK_API_V1
from llama_stack.schema_utils import webmethod
@ -20,7 +21,7 @@ class DatasetIO(Protocol):
# keeping for aligning with inference/safety, but this is not used
dataset_store: DatasetStore
@webmethod(route="/datasetio/iterrows/{dataset_id:path}", method="GET")
@webmethod(route="/datasetio/iterrows/{dataset_id:path}", method="GET", level=LLAMA_STACK_API_V1)
async def iterrows(
self,
dataset_id: str,
@ -44,7 +45,7 @@ class DatasetIO(Protocol):
"""
...
@webmethod(route="/datasetio/append-rows/{dataset_id:path}", method="POST")
@webmethod(route="/datasetio/append-rows/{dataset_id:path}", method="POST", level=LLAMA_STACK_API_V1)
async def append_rows(self, dataset_id: str, rows: list[dict[str, Any]]) -> None:
"""Append rows to a dataset.

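A hypothetical pagination loop over the iterrows route above (the client object, query parameter names, and response keys are assumptions based on the signature shown, not confirmed by this diff):

def iter_all_rows(client, dataset_id: str, chunk: int = 100):
    # Walk GET /v1/datasetio/iterrows/{dataset_id} page by page.
    start = 0
    while True:
        page = client.get(f"/v1/datasetio/iterrows/{dataset_id}",
                          params={"start_index": start, "limit": chunk}).json()
        yield from page["data"]
        start = page.get("next_start_index")
        if start is None:
            break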
View file

@ -10,6 +10,7 @@ from typing import Annotated, Any, Literal, Protocol
from pydantic import BaseModel, Field
from llama_stack.apis.resource import Resource, ResourceType
from llama_stack.apis.version import LLAMA_STACK_API_V1
from llama_stack.schema_utils import json_schema_type, register_schema, webmethod
@ -145,7 +146,7 @@ class ListDatasetsResponse(BaseModel):
class Datasets(Protocol):
@webmethod(route="/datasets", method="POST")
@webmethod(route="/datasets", method="POST", level=LLAMA_STACK_API_V1)
async def register_dataset(
self,
purpose: DatasetPurpose,
@ -214,7 +215,7 @@ class Datasets(Protocol):
"""
...
@webmethod(route="/datasets/{dataset_id:path}", method="GET")
@webmethod(route="/datasets/{dataset_id:path}", method="GET", level=LLAMA_STACK_API_V1)
async def get_dataset(
self,
dataset_id: str,
@ -226,7 +227,7 @@ class Datasets(Protocol):
"""
...
@webmethod(route="/datasets", method="GET")
@webmethod(route="/datasets", method="GET", level=LLAMA_STACK_API_V1)
async def list_datasets(self) -> ListDatasetsResponse:
"""List all datasets.
@ -234,7 +235,7 @@ class Datasets(Protocol):
"""
...
@webmethod(route="/datasets/{dataset_id:path}", method="DELETE")
@webmethod(route="/datasets/{dataset_id:path}", method="DELETE", level=LLAMA_STACK_API_V1)
async def unregister_dataset(
self,
dataset_id: str,

View file

@ -13,6 +13,7 @@ from llama_stack.apis.common.job_types import Job
from llama_stack.apis.inference import SamplingParams, SystemMessage
from llama_stack.apis.scoring import ScoringResult
from llama_stack.apis.scoring_functions import ScoringFnParams
from llama_stack.apis.version import LLAMA_STACK_API_V1, LLAMA_STACK_API_V1ALPHA
from llama_stack.schema_utils import json_schema_type, register_schema, webmethod
@ -83,7 +84,8 @@ class EvaluateResponse(BaseModel):
class Eval(Protocol):
"""Llama Stack Evaluation API for running evaluations on model and agent candidates."""
@webmethod(route="/eval/benchmarks/{benchmark_id}/jobs", method="POST")
@webmethod(route="/eval/benchmarks/{benchmark_id}/jobs", method="POST", level=LLAMA_STACK_API_V1, deprecated=True)
@webmethod(route="/eval/benchmarks/{benchmark_id}/jobs", method="POST", level=LLAMA_STACK_API_V1ALPHA)
async def run_eval(
self,
benchmark_id: str,
@ -97,7 +99,10 @@ class Eval(Protocol):
"""
...
@webmethod(route="/eval/benchmarks/{benchmark_id}/evaluations", method="POST")
@webmethod(
route="/eval/benchmarks/{benchmark_id}/evaluations", method="POST", level=LLAMA_STACK_API_V1, deprecated=True
)
@webmethod(route="/eval/benchmarks/{benchmark_id}/evaluations", method="POST", level=LLAMA_STACK_API_V1ALPHA)
async def evaluate_rows(
self,
benchmark_id: str,
@ -115,7 +120,10 @@ class Eval(Protocol):
"""
...
@webmethod(route="/eval/benchmarks/{benchmark_id}/jobs/{job_id}", method="GET")
@webmethod(
route="/eval/benchmarks/{benchmark_id}/jobs/{job_id}", method="GET", level=LLAMA_STACK_API_V1, deprecated=True
)
@webmethod(route="/eval/benchmarks/{benchmark_id}/jobs/{job_id}", method="GET", level=LLAMA_STACK_API_V1ALPHA)
async def job_status(self, benchmark_id: str, job_id: str) -> Job:
"""Get the status of a job.
@ -125,7 +133,13 @@ class Eval(Protocol):
"""
...
@webmethod(route="/eval/benchmarks/{benchmark_id}/jobs/{job_id}", method="DELETE")
@webmethod(
route="/eval/benchmarks/{benchmark_id}/jobs/{job_id}",
method="DELETE",
level=LLAMA_STACK_API_V1,
deprecated=True,
)
@webmethod(route="/eval/benchmarks/{benchmark_id}/jobs/{job_id}", method="DELETE", level=LLAMA_STACK_API_V1ALPHA)
async def job_cancel(self, benchmark_id: str, job_id: str) -> None:
"""Cancel a job.
@ -134,7 +148,15 @@ class Eval(Protocol):
"""
...
@webmethod(route="/eval/benchmarks/{benchmark_id}/jobs/{job_id}/result", method="GET")
@webmethod(
route="/eval/benchmarks/{benchmark_id}/jobs/{job_id}/result",
method="GET",
level=LLAMA_STACK_API_V1,
deprecated=True,
)
@webmethod(
route="/eval/benchmarks/{benchmark_id}/jobs/{job_id}/result", method="GET", level=LLAMA_STACK_API_V1ALPHA
)
async def job_result(self, benchmark_id: str, job_id: str) -> EvaluateResponse:
"""Get the result of a job.

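Taken together, the routes above form a submit/poll/fetch job lifecycle. A hypothetical client-side flow against the new v1alpha registrations (the /v1alpha prefix, payload shape, and status values are assumptions):

import time

def run_and_wait(client, benchmark_id: str, benchmark_config: dict) -> dict:
    base = f"/v1alpha/eval/benchmarks/{benchmark_id}/jobs"
    job = client.post(base, json=benchmark_config).json()     # run_eval
    while True:
        status = client.get(f"{base}/{job['job_id']}").json() # job_status
        if status.get("status") in ("completed", "failed"):
            break
        time.sleep(2)
    return client.get(f"{base}/{job['job_id']}/result").json()  # job_result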
View file

@ -11,6 +11,7 @@ from fastapi import File, Form, Response, UploadFile
from pydantic import BaseModel, Field
from llama_stack.apis.common.responses import Order
from llama_stack.apis.version import LLAMA_STACK_API_V1
from llama_stack.providers.utils.telemetry.trace_protocol import trace_protocol
from llama_stack.schema_utils import json_schema_type, webmethod
@ -104,7 +105,7 @@ class OpenAIFileDeleteResponse(BaseModel):
@trace_protocol
class Files(Protocol):
# OpenAI Files API Endpoints
@webmethod(route="/openai/v1/files", method="POST")
@webmethod(route="/openai/v1/files", method="POST", level=LLAMA_STACK_API_V1)
async def openai_upload_file(
self,
file: Annotated[UploadFile, File()],
@ -119,7 +120,7 @@ class Files(Protocol):
The file upload should be a multipart form request with:
- file: The File object (not file name) to be uploaded.
- purpose: The intended purpose of the uploaded file.
- expires_after: Optional form values describing expiration for the file. Expected expires_after[anchor] = "created_at", expires_after[seconds] = <int>. Seconds must be between 3600 and 2592000 (1 hour to 30 days).
- expires_after: Optional form values describing expiration for the file. Expected expires_after[anchor] = "created_at", expires_after[seconds] = {integer}. Seconds must be between 3600 and 2592000 (1 hour to 30 days).
:param file: The uploaded file object containing content and metadata (filename, content_type, etc.).
:param purpose: The intended purpose of the uploaded file (e.g., "assistants", "fine-tune").
@ -127,7 +128,7 @@ class Files(Protocol):
"""
...
@webmethod(route="/openai/v1/files", method="GET")
@webmethod(route="/openai/v1/files", method="GET", level=LLAMA_STACK_API_V1)
async def openai_list_files(
self,
after: str | None = None,
@ -146,7 +147,7 @@ class Files(Protocol):
"""
...
@webmethod(route="/openai/v1/files/{file_id}", method="GET")
@webmethod(route="/openai/v1/files/{file_id}", method="GET", level=LLAMA_STACK_API_V1)
async def openai_retrieve_file(
self,
file_id: str,
@ -159,7 +160,7 @@ class Files(Protocol):
"""
...
@webmethod(route="/openai/v1/files/{file_id}", method="DELETE")
@webmethod(route="/openai/v1/files/{file_id}", method="DELETE", level=LLAMA_STACK_API_V1)
async def openai_delete_file(
self,
file_id: str,
@ -172,7 +173,7 @@ class Files(Protocol):
"""
...
@webmethod(route="/openai/v1/files/{file_id}/content", method="GET")
@webmethod(route="/openai/v1/files/{file_id}/content", method="GET", level=LLAMA_STACK_API_V1)
async def openai_retrieve_file_content(
self,
file_id: str,

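The upload docstring spells out the multipart form layout, including the bracketed expires_after fields. A sketch with requests (the server address is an assumption; the field names and the allowed seconds range come from the docstring above):

import requests

resp = requests.post(
    "http://localhost:8321/openai/v1/files",       # assumed server address
    files={"file": ("notes.txt", b"hello world", "text/plain")},
    data={
        "purpose": "assistants",
        "expires_after[anchor]": "created_at",
        "expires_after[seconds]": "86400",         # one day; must be within 3600..2592000
    },
)
print(resp.json())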
View file

@ -21,6 +21,7 @@ from llama_stack.apis.common.content_types import ContentDelta, InterleavedConte
from llama_stack.apis.common.responses import Order
from llama_stack.apis.models import Model
from llama_stack.apis.telemetry import MetricResponseMixin
from llama_stack.apis.version import LLAMA_STACK_API_V1
from llama_stack.models.llama.datatypes import (
BuiltinTool,
StopReason,
@ -913,6 +914,7 @@ class OpenAIEmbeddingData(BaseModel):
"""
object: Literal["embedding"] = "embedding"
# TODO: consider dropping str and using openai.types.embeddings.Embedding instead of OpenAIEmbeddingData
embedding: list[float] | str
index: int
@ -973,26 +975,6 @@ class EmbeddingTaskType(Enum):
document = "document"
@json_schema_type
class BatchCompletionResponse(BaseModel):
"""Response from a batch completion request.
:param batch: List of completion responses, one for each input in the batch
"""
batch: list[CompletionResponse]
@json_schema_type
class BatchChatCompletionResponse(BaseModel):
"""Response from a batch chat completion request.
:param batch: List of chat completion responses, one for each conversation in the batch
"""
batch: list[ChatCompletionResponse]
class OpenAICompletionWithInputMessages(OpenAIChatCompletion):
input_messages: list[OpenAIMessageParam]
@ -1026,7 +1008,7 @@ class InferenceProvider(Protocol):
model_store: ModelStore | None = None
@webmethod(route="/inference/completion", method="POST")
@webmethod(route="/inference/completion", method="POST", level=LLAMA_STACK_API_V1)
async def completion(
self,
model_id: str,
@ -1049,28 +1031,7 @@ class InferenceProvider(Protocol):
"""
...
@webmethod(route="/inference/batch-completion", method="POST", experimental=True)
async def batch_completion(
self,
model_id: str,
content_batch: list[InterleavedContent],
sampling_params: SamplingParams | None = None,
response_format: ResponseFormat | None = None,
logprobs: LogProbConfig | None = None,
) -> BatchCompletionResponse:
"""Generate completions for a batch of content using the specified model.
:param model_id: The identifier of the model to use. The model must be registered with Llama Stack and available via the /models endpoint.
:param content_batch: The content to generate completions for.
:param sampling_params: (Optional) Parameters to control the sampling strategy.
:param response_format: (Optional) Grammar specification for guided (structured) decoding.
:param logprobs: (Optional) If specified, log probabilities for each token position will be returned.
:returns: A BatchCompletionResponse with the full completions.
"""
raise NotImplementedError("Batch completion is not implemented")
return # this is so mypy's safe-super rule will consider the method concrete
@webmethod(route="/inference/chat-completion", method="POST")
@webmethod(route="/inference/chat-completion", method="POST", level=LLAMA_STACK_API_V1)
async def chat_completion(
self,
model_id: str,
@ -1110,32 +1071,7 @@ class InferenceProvider(Protocol):
"""
...
@webmethod(route="/inference/batch-chat-completion", method="POST", experimental=True)
async def batch_chat_completion(
self,
model_id: str,
messages_batch: list[list[Message]],
sampling_params: SamplingParams | None = None,
tools: list[ToolDefinition] | None = None,
tool_config: ToolConfig | None = None,
response_format: ResponseFormat | None = None,
logprobs: LogProbConfig | None = None,
) -> BatchChatCompletionResponse:
"""Generate chat completions for a batch of messages using the specified model.
:param model_id: The identifier of the model to use. The model must be registered with Llama Stack and available via the /models endpoint.
:param messages_batch: The messages to generate completions for.
:param sampling_params: (Optional) Parameters to control the sampling strategy.
:param tools: (Optional) List of tool definitions available to the model.
:param tool_config: (Optional) Configuration for tool use.
:param response_format: (Optional) Grammar specification for guided (structured) decoding.
:param logprobs: (Optional) If specified, log probabilities for each token position will be returned.
:returns: A BatchChatCompletionResponse with the full completions.
"""
raise NotImplementedError("Batch chat completion is not implemented")
return # this is so mypy's safe-super rule will consider the method concrete
@webmethod(route="/inference/embeddings", method="POST")
@webmethod(route="/inference/embeddings", method="POST", level=LLAMA_STACK_API_V1)
async def embeddings(
self,
model_id: str,
@ -1155,7 +1091,7 @@ class InferenceProvider(Protocol):
"""
...
@webmethod(route="/inference/rerank", method="POST", experimental=True)
@webmethod(route="/inference/rerank", method="POST", experimental=True, level=LLAMA_STACK_API_V1)
async def rerank(
self,
model: str,
@ -1174,7 +1110,7 @@ class InferenceProvider(Protocol):
raise NotImplementedError("Reranking is not implemented")
return # this is so mypy's safe-super rule will consider the method concrete
@webmethod(route="/openai/v1/completions", method="POST")
@webmethod(route="/openai/v1/completions", method="POST", level=LLAMA_STACK_API_V1)
async def openai_completion(
self,
# Standard OpenAI completion parameters
@ -1225,7 +1161,7 @@ class InferenceProvider(Protocol):
"""
...
@webmethod(route="/openai/v1/chat/completions", method="POST")
@webmethod(route="/openai/v1/chat/completions", method="POST", level=LLAMA_STACK_API_V1)
async def openai_chat_completion(
self,
model: str,
@ -1281,7 +1217,7 @@ class InferenceProvider(Protocol):
"""
...
@webmethod(route="/openai/v1/embeddings", method="POST")
@webmethod(route="/openai/v1/embeddings", method="POST", level=LLAMA_STACK_API_V1)
async def openai_embeddings(
self,
model: str,
@ -1310,7 +1246,7 @@ class Inference(InferenceProvider):
- Embedding models: these models generate embeddings to be used for semantic search.
"""
@webmethod(route="/openai/v1/chat/completions", method="GET")
@webmethod(route="/openai/v1/chat/completions", method="GET", level=LLAMA_STACK_API_V1)
async def list_chat_completions(
self,
after: str | None = None,
@ -1328,7 +1264,7 @@ class Inference(InferenceProvider):
"""
raise NotImplementedError("List chat completions is not implemented")
@webmethod(route="/openai/v1/chat/completions/{completion_id}", method="GET")
@webmethod(route="/openai/v1/chat/completions/{completion_id}", method="GET", level=LLAMA_STACK_API_V1)
async def get_chat_completion(self, completion_id: str) -> OpenAICompletionWithInputMessages:
"""Describe a chat completion by its ID.

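With the experimental batch_completion and batch_chat_completion methods removed, the OpenAI-compatible routes carry the chat surface here; since they live under /openai/v1, the stock openai client should work against them. A sketch (base URL and model id are assumptions):

from openai import OpenAI

client = OpenAI(base_url="http://localhost:8321/openai/v1", api_key="none")
reply = client.chat.completions.create(
    model="llama3.2:3b",                  # hypothetical registered model id
    messages=[{"role": "user", "content": "Say hello."}],
)
print(reply.choices[0].message.content)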
View file

@ -8,6 +8,7 @@ from typing import Protocol, runtime_checkable
from pydantic import BaseModel
from llama_stack.apis.version import LLAMA_STACK_API_V1
from llama_stack.providers.datatypes import HealthStatus
from llama_stack.schema_utils import json_schema_type, webmethod
@ -57,7 +58,7 @@ class ListRoutesResponse(BaseModel):
@runtime_checkable
class Inspect(Protocol):
@webmethod(route="/inspect/routes", method="GET")
@webmethod(route="/inspect/routes", method="GET", level=LLAMA_STACK_API_V1)
async def list_routes(self) -> ListRoutesResponse:
"""List all available API routes with their methods and implementing providers.
@ -65,7 +66,7 @@ class Inspect(Protocol):
"""
...
@webmethod(route="/health", method="GET")
@webmethod(route="/health", method="GET", level=LLAMA_STACK_API_V1)
async def health(self) -> HealthInfo:
"""Get the current health status of the service.
@ -73,7 +74,7 @@ class Inspect(Protocol):
"""
...
@webmethod(route="/version", method="GET")
@webmethod(route="/version", method="GET", level=LLAMA_STACK_API_V1)
async def version(self) -> VersionInfo:
"""Get the version of the service.

View file

@ -10,6 +10,7 @@ from typing import Any, Literal, Protocol, runtime_checkable
from pydantic import BaseModel, ConfigDict, Field, field_validator
from llama_stack.apis.resource import Resource, ResourceType
from llama_stack.apis.version import LLAMA_STACK_API_V1
from llama_stack.providers.utils.telemetry.trace_protocol import trace_protocol
from llama_stack.schema_utils import json_schema_type, webmethod
@ -102,7 +103,7 @@ class OpenAIListModelsResponse(BaseModel):
@runtime_checkable
@trace_protocol
class Models(Protocol):
@webmethod(route="/models", method="GET")
@webmethod(route="/models", method="GET", level=LLAMA_STACK_API_V1)
async def list_models(self) -> ListModelsResponse:
"""List all models.
@ -110,7 +111,7 @@ class Models(Protocol):
"""
...
@webmethod(route="/openai/v1/models", method="GET")
@webmethod(route="/openai/v1/models", method="GET", level=LLAMA_STACK_API_V1)
async def openai_list_models(self) -> OpenAIListModelsResponse:
"""List models using the OpenAI API.
@ -118,7 +119,7 @@ class Models(Protocol):
"""
...
@webmethod(route="/models/{model_id:path}", method="GET")
@webmethod(route="/models/{model_id:path}", method="GET", level=LLAMA_STACK_API_V1)
async def get_model(
self,
model_id: str,
@ -130,7 +131,7 @@ class Models(Protocol):
"""
...
@webmethod(route="/models", method="POST")
@webmethod(route="/models", method="POST", level=LLAMA_STACK_API_V1)
async def register_model(
self,
model_id: str,
@ -150,7 +151,7 @@ class Models(Protocol):
"""
...
@webmethod(route="/models/{model_id:path}", method="DELETE")
@webmethod(route="/models/{model_id:path}", method="DELETE", level=LLAMA_STACK_API_V1)
async def unregister_model(
self,
model_id: str,

View file

@ -13,6 +13,7 @@ from pydantic import BaseModel, Field
from llama_stack.apis.common.content_types import URL
from llama_stack.apis.common.job_types import JobStatus
from llama_stack.apis.common.training_types import Checkpoint
from llama_stack.apis.version import LLAMA_STACK_API_V1, LLAMA_STACK_API_V1ALPHA
from llama_stack.schema_utils import json_schema_type, register_schema, webmethod
@ -283,7 +284,8 @@ class PostTrainingJobArtifactsResponse(BaseModel):
class PostTraining(Protocol):
@webmethod(route="/post-training/supervised-fine-tune", method="POST")
@webmethod(route="/post-training/supervised-fine-tune", method="POST", level=LLAMA_STACK_API_V1, deprecated=True)
@webmethod(route="/post-training/supervised-fine-tune", method="POST", level=LLAMA_STACK_API_V1ALPHA)
async def supervised_fine_tune(
self,
job_uuid: str,
@ -310,7 +312,8 @@ class PostTraining(Protocol):
"""
...
@webmethod(route="/post-training/preference-optimize", method="POST")
@webmethod(route="/post-training/preference-optimize", method="POST", level=LLAMA_STACK_API_V1, deprecated=True)
@webmethod(route="/post-training/preference-optimize", method="POST", level=LLAMA_STACK_API_V1ALPHA)
async def preference_optimize(
self,
job_uuid: str,
@ -332,7 +335,8 @@ class PostTraining(Protocol):
"""
...
@webmethod(route="/post-training/jobs", method="GET")
@webmethod(route="/post-training/jobs", method="GET", level=LLAMA_STACK_API_V1, deprecated=True)
@webmethod(route="/post-training/jobs", method="GET", level=LLAMA_STACK_API_V1ALPHA)
async def get_training_jobs(self) -> ListPostTrainingJobsResponse:
"""Get all training jobs.
@ -340,7 +344,8 @@ class PostTraining(Protocol):
"""
...
@webmethod(route="/post-training/job/status", method="GET")
@webmethod(route="/post-training/job/status", method="GET", level=LLAMA_STACK_API_V1, deprecated=True)
@webmethod(route="/post-training/job/status", method="GET", level=LLAMA_STACK_API_V1ALPHA)
async def get_training_job_status(self, job_uuid: str) -> PostTrainingJobStatusResponse:
"""Get the status of a training job.
@ -349,7 +354,8 @@ class PostTraining(Protocol):
"""
...
@webmethod(route="/post-training/job/cancel", method="POST")
@webmethod(route="/post-training/job/cancel", method="POST", level=LLAMA_STACK_API_V1, deprecated=True)
@webmethod(route="/post-training/job/cancel", method="POST", level=LLAMA_STACK_API_V1ALPHA)
async def cancel_training_job(self, job_uuid: str) -> None:
"""Cancel a training job.
@ -357,7 +363,8 @@ class PostTraining(Protocol):
"""
...
@webmethod(route="/post-training/job/artifacts", method="GET")
@webmethod(route="/post-training/job/artifacts", method="GET", level=LLAMA_STACK_API_V1, deprecated=True)
@webmethod(route="/post-training/job/artifacts", method="GET", level=LLAMA_STACK_API_V1ALPHA)
async def get_training_job_artifacts(self, job_uuid: str) -> PostTrainingJobArtifactsResponse:
"""Get the artifacts of a training job.

View file

@ -10,6 +10,7 @@ from typing import Protocol, runtime_checkable
from pydantic import BaseModel, Field, field_validator, model_validator
from llama_stack.apis.version import LLAMA_STACK_API_V1
from llama_stack.providers.utils.telemetry.trace_protocol import trace_protocol
from llama_stack.schema_utils import json_schema_type, webmethod
@ -95,7 +96,7 @@ class ListPromptsResponse(BaseModel):
class Prompts(Protocol):
"""Protocol for prompt management operations."""
@webmethod(route="/prompts", method="GET")
@webmethod(route="/prompts", method="GET", level=LLAMA_STACK_API_V1)
async def list_prompts(self) -> ListPromptsResponse:
"""List all prompts.
@ -103,7 +104,7 @@ class Prompts(Protocol):
"""
...
@webmethod(route="/prompts/{prompt_id}/versions", method="GET")
@webmethod(route="/prompts/{prompt_id}/versions", method="GET", level=LLAMA_STACK_API_V1)
async def list_prompt_versions(
self,
prompt_id: str,
@ -115,7 +116,7 @@ class Prompts(Protocol):
"""
...
@webmethod(route="/prompts/{prompt_id}", method="GET")
@webmethod(route="/prompts/{prompt_id}", method="GET", level=LLAMA_STACK_API_V1)
async def get_prompt(
self,
prompt_id: str,
@ -129,7 +130,7 @@ class Prompts(Protocol):
"""
...
@webmethod(route="/prompts", method="POST")
@webmethod(route="/prompts", method="POST", level=LLAMA_STACK_API_V1)
async def create_prompt(
self,
prompt: str,
@ -143,7 +144,7 @@ class Prompts(Protocol):
"""
...
@webmethod(route="/prompts/{prompt_id}", method="PUT")
@webmethod(route="/prompts/{prompt_id}", method="PUT", level=LLAMA_STACK_API_V1)
async def update_prompt(
self,
prompt_id: str,
@ -163,7 +164,7 @@ class Prompts(Protocol):
"""
...
@webmethod(route="/prompts/{prompt_id}", method="DELETE")
@webmethod(route="/prompts/{prompt_id}", method="DELETE", level=LLAMA_STACK_API_V1)
async def delete_prompt(
self,
prompt_id: str,
@ -174,7 +175,7 @@ class Prompts(Protocol):
"""
...
@webmethod(route="/prompts/{prompt_id}/set-default-version", method="PUT")
@webmethod(route="/prompts/{prompt_id}/set-default-version", method="PUT", level=LLAMA_STACK_API_V1)
async def set_default_version(
self,
prompt_id: str,

View file

@ -8,6 +8,7 @@ from typing import Any, Protocol, runtime_checkable
from pydantic import BaseModel
from llama_stack.apis.version import LLAMA_STACK_API_V1
from llama_stack.providers.datatypes import HealthResponse
from llama_stack.schema_utils import json_schema_type, webmethod
@ -45,7 +46,7 @@ class Providers(Protocol):
Providers API for inspecting, listing, and modifying providers and their configurations.
"""
@webmethod(route="/providers", method="GET")
@webmethod(route="/providers", method="GET", level=LLAMA_STACK_API_V1)
async def list_providers(self) -> ListProvidersResponse:
"""List all available providers.
@ -53,7 +54,7 @@ class Providers(Protocol):
"""
...
@webmethod(route="/providers/{provider_id}", method="GET")
@webmethod(route="/providers/{provider_id}", method="GET", level=LLAMA_STACK_API_V1)
async def inspect_provider(self, provider_id: str) -> ProviderInfo:
"""Get detailed information about a specific provider.

View file

@ -11,6 +11,7 @@ from pydantic import BaseModel, Field
from llama_stack.apis.inference import Message
from llama_stack.apis.shields import Shield
from llama_stack.apis.version import LLAMA_STACK_API_V1
from llama_stack.providers.utils.telemetry.trace_protocol import trace_protocol
from llama_stack.schema_utils import json_schema_type, webmethod
@ -97,7 +98,7 @@ class ShieldStore(Protocol):
class Safety(Protocol):
shield_store: ShieldStore
@webmethod(route="/safety/run-shield", method="POST")
@webmethod(route="/safety/run-shield", method="POST", level=LLAMA_STACK_API_V1)
async def run_shield(
self,
shield_id: str,
@ -113,7 +114,7 @@ class Safety(Protocol):
"""
...
@webmethod(route="/openai/v1/moderations", method="POST")
@webmethod(route="/openai/v1/moderations", method="POST", level=LLAMA_STACK_API_V1)
async def run_moderation(self, input: str | list[str], model: str) -> ModerationObject:
"""Classifies if text and/or image inputs are potentially harmful.
:param input: Input (or inputs) to classify.

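run_moderation is likewise exposed at an OpenAI-compatible route (/openai/v1/moderations), so the same client pattern should apply. A sketch (base URL and model id are assumptions):

from openai import OpenAI

client = OpenAI(base_url="http://localhost:8321/openai/v1", api_key="none")
result = client.moderations.create(
    model="llama-guard3:1b",              # hypothetical shield-backed model
    input="How do I pick a lock?",
)
print(result.results[0].flagged)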
View file

@ -9,6 +9,7 @@ from typing import Any, Protocol, runtime_checkable
from pydantic import BaseModel
from llama_stack.apis.scoring_functions import ScoringFn, ScoringFnParams
from llama_stack.apis.version import LLAMA_STACK_API_V1
from llama_stack.schema_utils import json_schema_type, webmethod
# mapping of metric to value
@ -61,7 +62,7 @@ class ScoringFunctionStore(Protocol):
class Scoring(Protocol):
scoring_function_store: ScoringFunctionStore
@webmethod(route="/scoring/score-batch", method="POST")
@webmethod(route="/scoring/score-batch", method="POST", level=LLAMA_STACK_API_V1)
async def score_batch(
self,
dataset_id: str,
@ -77,7 +78,7 @@ class Scoring(Protocol):
"""
...
@webmethod(route="/scoring/score", method="POST")
@webmethod(route="/scoring/score", method="POST", level=LLAMA_STACK_API_V1)
async def score(
self,
input_rows: list[dict[str, Any]],

View file

@ -18,6 +18,7 @@ from pydantic import BaseModel, Field
from llama_stack.apis.common.type_system import ParamType
from llama_stack.apis.resource import Resource, ResourceType
from llama_stack.apis.version import LLAMA_STACK_API_V1
from llama_stack.schema_utils import json_schema_type, register_schema, webmethod
@ -160,7 +161,7 @@ class ListScoringFunctionsResponse(BaseModel):
@runtime_checkable
class ScoringFunctions(Protocol):
@webmethod(route="/scoring-functions", method="GET")
@webmethod(route="/scoring-functions", method="GET", level=LLAMA_STACK_API_V1)
async def list_scoring_functions(self) -> ListScoringFunctionsResponse:
"""List all scoring functions.
@ -168,7 +169,7 @@ class ScoringFunctions(Protocol):
"""
...
@webmethod(route="/scoring-functions/{scoring_fn_id:path}", method="GET")
@webmethod(route="/scoring-functions/{scoring_fn_id:path}", method="GET", level=LLAMA_STACK_API_V1)
async def get_scoring_function(self, scoring_fn_id: str, /) -> ScoringFn:
"""Get a scoring function by its ID.
@ -177,7 +178,7 @@ class ScoringFunctions(Protocol):
"""
...
@webmethod(route="/scoring-functions", method="POST")
@webmethod(route="/scoring-functions", method="POST", level=LLAMA_STACK_API_V1)
async def register_scoring_function(
self,
scoring_fn_id: str,
@ -198,7 +199,7 @@ class ScoringFunctions(Protocol):
"""
...
@webmethod(route="/scoring-functions/{scoring_fn_id:path}", method="DELETE")
@webmethod(route="/scoring-functions/{scoring_fn_id:path}", method="DELETE", level=LLAMA_STACK_API_V1)
async def unregister_scoring_function(self, scoring_fn_id: str) -> None:
"""Unregister a scoring function.

View file

@ -9,6 +9,7 @@ from typing import Any, Literal, Protocol, runtime_checkable
from pydantic import BaseModel
from llama_stack.apis.resource import Resource, ResourceType
from llama_stack.apis.version import LLAMA_STACK_API_V1
from llama_stack.providers.utils.telemetry.trace_protocol import trace_protocol
from llama_stack.schema_utils import json_schema_type, webmethod
@ -49,7 +50,7 @@ class ListShieldsResponse(BaseModel):
@runtime_checkable
@trace_protocol
class Shields(Protocol):
@webmethod(route="/shields", method="GET")
@webmethod(route="/shields", method="GET", level=LLAMA_STACK_API_V1)
async def list_shields(self) -> ListShieldsResponse:
"""List all shields.
@ -57,7 +58,7 @@ class Shields(Protocol):
"""
...
@webmethod(route="/shields/{identifier:path}", method="GET")
@webmethod(route="/shields/{identifier:path}", method="GET", level=LLAMA_STACK_API_V1)
async def get_shield(self, identifier: str) -> Shield:
"""Get a shield by its identifier.
@ -66,7 +67,7 @@ class Shields(Protocol):
"""
...
@webmethod(route="/shields", method="POST")
@webmethod(route="/shields", method="POST", level=LLAMA_STACK_API_V1)
async def register_shield(
self,
shield_id: str,
@ -84,7 +85,7 @@ class Shields(Protocol):
"""
...
@webmethod(route="/shields/{identifier:path}", method="DELETE")
@webmethod(route="/shields/{identifier:path}", method="DELETE", level=LLAMA_STACK_API_V1)
async def unregister_shield(self, identifier: str) -> None:
"""Unregister a shield.

View file

@ -10,6 +10,7 @@ from typing import Any, Protocol
from pydantic import BaseModel
from llama_stack.apis.inference import Message
from llama_stack.apis.version import LLAMA_STACK_API_V1
from llama_stack.schema_utils import json_schema_type, webmethod
@ -59,7 +60,7 @@ class SyntheticDataGenerationResponse(BaseModel):
class SyntheticDataGeneration(Protocol):
@webmethod(route="/synthetic-data-generation/generate")
@webmethod(route="/synthetic-data-generation/generate", level=LLAMA_STACK_API_V1)
def synthetic_data_generate(
self,
dialogs: list[Message],

View file

@ -16,6 +16,7 @@ from typing import (
from pydantic import BaseModel, Field
from llama_stack.apis.version import LLAMA_STACK_API_V1
from llama_stack.models.llama.datatypes import Primitive
from llama_stack.schema_utils import json_schema_type, register_schema, webmethod
@ -412,7 +413,7 @@ class QueryMetricsResponse(BaseModel):
@runtime_checkable
class Telemetry(Protocol):
@webmethod(route="/telemetry/events", method="POST")
@webmethod(route="/telemetry/events", method="POST", level=LLAMA_STACK_API_V1)
async def log_event(
self,
event: Event,
@ -425,7 +426,7 @@ class Telemetry(Protocol):
"""
...
@webmethod(route="/telemetry/traces", method="POST", required_scope=REQUIRED_SCOPE)
@webmethod(route="/telemetry/traces", method="POST", required_scope=REQUIRED_SCOPE, level=LLAMA_STACK_API_V1)
async def query_traces(
self,
attribute_filters: list[QueryCondition] | None = None,
@ -443,7 +444,9 @@ class Telemetry(Protocol):
"""
...
@webmethod(route="/telemetry/traces/{trace_id:path}", method="GET", required_scope=REQUIRED_SCOPE)
@webmethod(
route="/telemetry/traces/{trace_id:path}", method="GET", required_scope=REQUIRED_SCOPE, level=LLAMA_STACK_API_V1
)
async def get_trace(self, trace_id: str) -> Trace:
"""Get a trace by its ID.
@ -453,7 +456,10 @@ class Telemetry(Protocol):
...
@webmethod(
route="/telemetry/traces/{trace_id:path}/spans/{span_id:path}", method="GET", required_scope=REQUIRED_SCOPE
route="/telemetry/traces/{trace_id:path}/spans/{span_id:path}",
method="GET",
required_scope=REQUIRED_SCOPE,
level=LLAMA_STACK_API_V1,
)
async def get_span(self, trace_id: str, span_id: str) -> Span:
"""Get a span by its ID.
@ -464,7 +470,12 @@ class Telemetry(Protocol):
"""
...
@webmethod(route="/telemetry/spans/{span_id:path}/tree", method="POST", required_scope=REQUIRED_SCOPE)
@webmethod(
route="/telemetry/spans/{span_id:path}/tree",
method="POST",
required_scope=REQUIRED_SCOPE,
level=LLAMA_STACK_API_V1,
)
async def get_span_tree(
self,
span_id: str,
@ -480,7 +491,7 @@ class Telemetry(Protocol):
"""
...
@webmethod(route="/telemetry/spans", method="POST", required_scope=REQUIRED_SCOPE)
@webmethod(route="/telemetry/spans", method="POST", required_scope=REQUIRED_SCOPE, level=LLAMA_STACK_API_V1)
async def query_spans(
self,
attribute_filters: list[QueryCondition],
@ -496,7 +507,7 @@ class Telemetry(Protocol):
"""
...
@webmethod(route="/telemetry/spans/export", method="POST")
@webmethod(route="/telemetry/spans/export", method="POST", level=LLAMA_STACK_API_V1)
async def save_spans_to_dataset(
self,
attribute_filters: list[QueryCondition],
@ -513,7 +524,9 @@ class Telemetry(Protocol):
"""
...
@webmethod(route="/telemetry/metrics/{metric_name}", method="POST", required_scope=REQUIRED_SCOPE)
@webmethod(
route="/telemetry/metrics/{metric_name}", method="POST", required_scope=REQUIRED_SCOPE, level=LLAMA_STACK_API_V1
)
async def query_metrics(
self,
metric_name: str,

View file

@ -11,6 +11,7 @@ from pydantic import BaseModel, Field, field_validator
from typing_extensions import runtime_checkable
from llama_stack.apis.common.content_types import URL, InterleavedContent
from llama_stack.apis.version import LLAMA_STACK_API_V1
from llama_stack.providers.utils.telemetry.trace_protocol import trace_protocol
from llama_stack.schema_utils import json_schema_type, register_schema, webmethod
@ -185,7 +186,7 @@ class RAGQueryConfig(BaseModel):
@runtime_checkable
@trace_protocol
class RAGToolRuntime(Protocol):
@webmethod(route="/tool-runtime/rag-tool/insert", method="POST")
@webmethod(route="/tool-runtime/rag-tool/insert", method="POST", level=LLAMA_STACK_API_V1)
async def insert(
self,
documents: list[RAGDocument],
@ -200,7 +201,7 @@ class RAGToolRuntime(Protocol):
"""
...
@webmethod(route="/tool-runtime/rag-tool/query", method="POST")
@webmethod(route="/tool-runtime/rag-tool/query", method="POST", level=LLAMA_STACK_API_V1)
async def query(
self,
content: InterleavedContent,

View file

@ -12,6 +12,7 @@ from typing_extensions import runtime_checkable
from llama_stack.apis.common.content_types import URL, InterleavedContent
from llama_stack.apis.resource import Resource, ResourceType
from llama_stack.apis.version import LLAMA_STACK_API_V1
from llama_stack.providers.utils.telemetry.trace_protocol import trace_protocol
from llama_stack.schema_utils import json_schema_type, webmethod
@ -155,7 +156,7 @@ class ListToolDefsResponse(BaseModel):
@runtime_checkable
@trace_protocol
class ToolGroups(Protocol):
@webmethod(route="/toolgroups", method="POST")
@webmethod(route="/toolgroups", method="POST", level=LLAMA_STACK_API_V1)
async def register_tool_group(
self,
toolgroup_id: str,
@ -172,7 +173,7 @@ class ToolGroups(Protocol):
"""
...
@webmethod(route="/toolgroups/{toolgroup_id:path}", method="GET")
@webmethod(route="/toolgroups/{toolgroup_id:path}", method="GET", level=LLAMA_STACK_API_V1)
async def get_tool_group(
self,
toolgroup_id: str,
@ -184,7 +185,7 @@ class ToolGroups(Protocol):
"""
...
@webmethod(route="/toolgroups", method="GET")
@webmethod(route="/toolgroups", method="GET", level=LLAMA_STACK_API_V1)
async def list_tool_groups(self) -> ListToolGroupsResponse:
"""List tool groups with optional provider.
@ -192,7 +193,7 @@ class ToolGroups(Protocol):
"""
...
@webmethod(route="/tools", method="GET")
@webmethod(route="/tools", method="GET", level=LLAMA_STACK_API_V1)
async def list_tools(self, toolgroup_id: str | None = None) -> ListToolsResponse:
"""List tools with optional tool group.
@ -201,7 +202,7 @@ class ToolGroups(Protocol):
"""
...
@webmethod(route="/tools/{tool_name:path}", method="GET")
@webmethod(route="/tools/{tool_name:path}", method="GET", level=LLAMA_STACK_API_V1)
async def get_tool(
self,
tool_name: str,
@ -213,7 +214,7 @@ class ToolGroups(Protocol):
"""
...
@webmethod(route="/toolgroups/{toolgroup_id:path}", method="DELETE")
@webmethod(route="/toolgroups/{toolgroup_id:path}", method="DELETE", level=LLAMA_STACK_API_V1)
async def unregister_toolgroup(
self,
toolgroup_id: str,
@ -242,7 +243,7 @@ class ToolRuntime(Protocol):
rag_tool: RAGToolRuntime | None = None
# TODO: This needs to be renamed once OPEN API generator name conflict issue is fixed.
@webmethod(route="/tool-runtime/list-tools", method="GET")
@webmethod(route="/tool-runtime/list-tools", method="GET", level=LLAMA_STACK_API_V1)
async def list_runtime_tools(
self, tool_group_id: str | None = None, mcp_endpoint: URL | None = None
) -> ListToolDefsResponse:
@ -254,7 +255,7 @@ class ToolRuntime(Protocol):
"""
...
@webmethod(route="/tool-runtime/invoke", method="POST")
@webmethod(route="/tool-runtime/invoke", method="POST", level=LLAMA_STACK_API_V1)
async def invoke_tool(self, tool_name: str, kwargs: dict[str, Any]) -> ToolInvocationResult:
"""Run a tool with the given arguments.

View file

@ -9,6 +9,7 @@ from typing import Literal, Protocol, runtime_checkable
from pydantic import BaseModel
from llama_stack.apis.resource import Resource, ResourceType
from llama_stack.apis.version import LLAMA_STACK_API_V1
from llama_stack.providers.utils.telemetry.trace_protocol import trace_protocol
from llama_stack.schema_utils import json_schema_type, webmethod
@ -65,7 +66,7 @@ class ListVectorDBsResponse(BaseModel):
@runtime_checkable
@trace_protocol
class VectorDBs(Protocol):
@webmethod(route="/vector-dbs", method="GET")
@webmethod(route="/vector-dbs", method="GET", level=LLAMA_STACK_API_V1)
async def list_vector_dbs(self) -> ListVectorDBsResponse:
"""List all vector databases.
@ -73,7 +74,7 @@ class VectorDBs(Protocol):
"""
...
@webmethod(route="/vector-dbs/{vector_db_id:path}", method="GET")
@webmethod(route="/vector-dbs/{vector_db_id:path}", method="GET", level=LLAMA_STACK_API_V1)
async def get_vector_db(
self,
vector_db_id: str,
@ -85,7 +86,7 @@ class VectorDBs(Protocol):
"""
...
@webmethod(route="/vector-dbs", method="POST")
@webmethod(route="/vector-dbs", method="POST", level=LLAMA_STACK_API_V1)
async def register_vector_db(
self,
vector_db_id: str,
@ -107,7 +108,7 @@ class VectorDBs(Protocol):
"""
...
@webmethod(route="/vector-dbs/{vector_db_id:path}", method="DELETE")
@webmethod(route="/vector-dbs/{vector_db_id:path}", method="DELETE", level=LLAMA_STACK_API_V1)
async def unregister_vector_db(self, vector_db_id: str) -> None:
"""Unregister a vector database.

View file

@ -15,6 +15,7 @@ from pydantic import BaseModel, Field
from llama_stack.apis.inference import InterleavedContent
from llama_stack.apis.vector_dbs import VectorDB
from llama_stack.apis.version import LLAMA_STACK_API_V1
from llama_stack.providers.utils.telemetry.trace_protocol import trace_protocol
from llama_stack.providers.utils.vector_io.vector_utils import generate_chunk_id
from llama_stack.schema_utils import json_schema_type, webmethod
@ -437,7 +438,7 @@ class VectorIO(Protocol):
# this will just block now until chunks are inserted, but it should
# probably return a Job instance which can be polled for completion
@webmethod(route="/vector-io/insert", method="POST")
@webmethod(route="/vector-io/insert", method="POST", level=LLAMA_STACK_API_V1)
async def insert_chunks(
self,
vector_db_id: str,
@ -455,7 +456,7 @@ class VectorIO(Protocol):
"""
...
@webmethod(route="/vector-io/query", method="POST")
@webmethod(route="/vector-io/query", method="POST", level=LLAMA_STACK_API_V1)
async def query_chunks(
self,
vector_db_id: str,
@ -472,7 +473,7 @@ class VectorIO(Protocol):
...
# OpenAI Vector Stores API endpoints
@webmethod(route="/openai/v1/vector_stores", method="POST")
@webmethod(route="/openai/v1/vector_stores", method="POST", level=LLAMA_STACK_API_V1)
async def openai_create_vector_store(
self,
name: str | None = None,
@ -498,7 +499,7 @@ class VectorIO(Protocol):
"""
...
@webmethod(route="/openai/v1/vector_stores", method="GET")
@webmethod(route="/openai/v1/vector_stores", method="GET", level=LLAMA_STACK_API_V1)
async def openai_list_vector_stores(
self,
limit: int | None = 20,
@ -516,7 +517,7 @@ class VectorIO(Protocol):
"""
...
@webmethod(route="/openai/v1/vector_stores/{vector_store_id}", method="GET")
@webmethod(route="/openai/v1/vector_stores/{vector_store_id}", method="GET", level=LLAMA_STACK_API_V1)
async def openai_retrieve_vector_store(
self,
vector_store_id: str,
@ -528,7 +529,7 @@ class VectorIO(Protocol):
"""
...
@webmethod(route="/openai/v1/vector_stores/{vector_store_id}", method="POST")
@webmethod(route="/openai/v1/vector_stores/{vector_store_id}", method="POST", level=LLAMA_STACK_API_V1)
async def openai_update_vector_store(
self,
vector_store_id: str,
@ -546,7 +547,7 @@ class VectorIO(Protocol):
"""
...
@webmethod(route="/openai/v1/vector_stores/{vector_store_id}", method="DELETE")
@webmethod(route="/openai/v1/vector_stores/{vector_store_id}", method="DELETE", level=LLAMA_STACK_API_V1)
async def openai_delete_vector_store(
self,
vector_store_id: str,
@ -558,7 +559,7 @@ class VectorIO(Protocol):
"""
...
@webmethod(route="/openai/v1/vector_stores/{vector_store_id}/search", method="POST")
@webmethod(route="/openai/v1/vector_stores/{vector_store_id}/search", method="POST", level=LLAMA_STACK_API_V1)
async def openai_search_vector_store(
self,
vector_store_id: str,
@ -584,7 +585,7 @@ class VectorIO(Protocol):
"""
...
@webmethod(route="/openai/v1/vector_stores/{vector_store_id}/files", method="POST")
@webmethod(route="/openai/v1/vector_stores/{vector_store_id}/files", method="POST", level=LLAMA_STACK_API_V1)
async def openai_attach_file_to_vector_store(
self,
vector_store_id: str,
@ -602,7 +603,7 @@ class VectorIO(Protocol):
"""
...
@webmethod(route="/openai/v1/vector_stores/{vector_store_id}/files", method="GET")
@webmethod(route="/openai/v1/vector_stores/{vector_store_id}/files", method="GET", level=LLAMA_STACK_API_V1)
async def openai_list_files_in_vector_store(
self,
vector_store_id: str,
@ -624,7 +625,9 @@ class VectorIO(Protocol):
"""
...
@webmethod(route="/openai/v1/vector_stores/{vector_store_id}/files/{file_id}", method="GET")
@webmethod(
route="/openai/v1/vector_stores/{vector_store_id}/files/{file_id}", method="GET", level=LLAMA_STACK_API_V1
)
async def openai_retrieve_vector_store_file(
self,
vector_store_id: str,
@ -638,7 +641,11 @@ class VectorIO(Protocol):
"""
...
@webmethod(route="/openai/v1/vector_stores/{vector_store_id}/files/{file_id}/content", method="GET")
@webmethod(
route="/openai/v1/vector_stores/{vector_store_id}/files/{file_id}/content",
method="GET",
level=LLAMA_STACK_API_V1,
)
async def openai_retrieve_vector_store_file_contents(
self,
vector_store_id: str,
@ -652,7 +659,9 @@ class VectorIO(Protocol):
"""
...
@webmethod(route="/openai/v1/vector_stores/{vector_store_id}/files/{file_id}", method="POST")
@webmethod(
route="/openai/v1/vector_stores/{vector_store_id}/files/{file_id}", method="POST", level=LLAMA_STACK_API_V1
)
async def openai_update_vector_store_file(
self,
vector_store_id: str,
@ -668,7 +677,9 @@ class VectorIO(Protocol):
"""
...
@webmethod(route="/openai/v1/vector_stores/{vector_store_id}/files/{file_id}", method="DELETE")
@webmethod(
route="/openai/v1/vector_stores/{vector_store_id}/files/{file_id}", method="DELETE", level=LLAMA_STACK_API_V1
)
async def openai_delete_vector_store_file(
self,
vector_store_id: str,

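The vector-store routes above trace the OpenAI vector stores surface: create a store, attach a file, then inspect the files. A sketch with the openai client (base URL and file id are assumptions; recent openai versions expose these under client.vector_stores, older ones under client.beta.vector_stores):

from openai import OpenAI

client = OpenAI(base_url="http://localhost:8321/openai/v1", api_key="none")
store = client.vector_stores.create(name="docs")        # POST /openai/v1/vector_stores
client.vector_stores.files.create(vector_store_id=store.id,
                                  file_id="file-abc123")  # hypothetical file id
for f in client.vector_stores.files.list(vector_store_id=store.id):
    print(f.id, f.status)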
View file

@ -4,4 +4,6 @@
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
LLAMA_STACK_API_VERSION = "v1"
LLAMA_STACK_API_V1 = "v1"
LLAMA_STACK_API_V1BETA = "v1beta"
LLAMA_STACK_API_V1ALPHA = "v1alpha"
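A minimal sketch of how a router might combine these constants with a webmethod's level to produce the final path (versioned_route is a hypothetical helper, not part of this file):

def versioned_route(level: str, route: str) -> str:
    # Prefix an un-versioned webmethod route with its API level.
    return f"/{level}{route}"

assert versioned_route(LLAMA_STACK_API_V1, "/models") == "/v1/models"
assert versioned_route(LLAMA_STACK_API_V1ALPHA, "/eval/benchmarks") == "/v1alpha/eval/benchmarks"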