Merge branch 'main' into feat/gunicorn-production-server

2025-12-03 09:53:45 +00:00 · 2025-11-04 15:57:41 +02:00 · 2025-11-04 15:57:41 +02:00 · 9ff881a28a
commit 9ff881a28a
parent b728307427 a6ddbae0ed
17 changed files with 439 additions and 11392 deletions
--- a/client-sdks/stainless/openapi.yml
+++ b/client-sdks/stainless/openapi.yml
@ -977,11 +977,11 @@ paths:
    get:
      responses:
        '200':
-          description: A ListModelsResponse.
+          description: A OpenAIListModelsResponse.
          content:
            application/json:
              schema:
-                $ref: '#/components/schemas/ListModelsResponse'
+                $ref: '#/components/schemas/OpenAIListModelsResponse'
        '400':
          $ref: '#/components/responses/BadRequest400'
        '429':
@ -994,8 +994,8 @@ paths:
          $ref: '#/components/responses/DefaultError'
      tags:
        - Models
-      summary: List all models.
-      description: List all models.
+      summary: List models using the OpenAI API.
+      description: List models using the OpenAI API.
      parameters: []
      deprecated: false
    post:
@ -1129,31 +1129,6 @@ paths:
              $ref: '#/components/schemas/RunModerationRequest'
        required: true
      deprecated: false
-  /v1/openai/v1/models:
-    get:
-      responses:
-        '200':
-          description: A OpenAIListModelsResponse.
-          content:
-            application/json:
-              schema:
-                $ref: '#/components/schemas/OpenAIListModelsResponse'
-        '400':
-          $ref: '#/components/responses/BadRequest400'
-        '429':
-          $ref: >-
-            #/components/responses/TooManyRequests429
-        '500':
-          $ref: >-
-            #/components/responses/InternalServerError500
-        default:
-          $ref: '#/components/responses/DefaultError'
-      tags:
-        - Models
-      summary: List models using the OpenAI API.
-      description: List models using the OpenAI API.
-      parameters: []
-      deprecated: false
  /v1/prompts:
    get:
      responses:
@ -6823,6 +6798,88 @@ components:
      title: ListRoutesResponse
      description: >-
        Response containing a list of all available API routes.
+    OpenAIModel:
+      type: object
+      properties:
+        id:
+          type: string
+        object:
+          type: string
+          const: model
+          default: model
+        created:
+          type: integer
+        owned_by:
+          type: string
+        custom_metadata:
+          type: object
+          additionalProperties:
+            oneOf:
+              - type: 'null'
+              - type: boolean
+              - type: number
+              - type: string
+              - type: array
+              - type: object
+      additionalProperties: false
+      required:
+        - id
+        - object
+        - created
+        - owned_by
+      title: OpenAIModel
+      description: A model from OpenAI.
+    OpenAIListModelsResponse:
+      type: object
+      properties:
+        data:
+          type: array
+          items:
+            $ref: '#/components/schemas/OpenAIModel'
+      additionalProperties: false
+      required:
+        - data
+      title: OpenAIListModelsResponse
+    ModelType:
+      type: string
+      enum:
+        - llm
+        - embedding
+        - rerank
+      title: ModelType
+      description: >-
+        Enumeration of supported model types in Llama Stack.
+    RegisterModelRequest:
+      type: object
+      properties:
+        model_id:
+          type: string
+          description: The identifier of the model to register.
+        provider_model_id:
+          type: string
+          description: >-
+            The identifier of the model in the provider.
+        provider_id:
+          type: string
+          description: The identifier of the provider.
+        metadata:
+          type: object
+          additionalProperties:
+            oneOf:
+              - type: 'null'
+              - type: boolean
+              - type: number
+              - type: string
+              - type: array
+              - type: object
+          description: Any additional metadata for this model.
+        model_type:
+          $ref: '#/components/schemas/ModelType'
+          description: The type of model to register.
+      additionalProperties: false
+      required:
+        - model_id
+      title: RegisterModelRequest
    Model:
      type: object
      properties:
@ -6880,57 +6937,6 @@ components:
      title: Model
      description: >-
        A model resource representing an AI model registered in Llama Stack.
-    ModelType:
-      type: string
-      enum:
-        - llm
-        - embedding
-        - rerank
-      title: ModelType
-      description: >-
-        Enumeration of supported model types in Llama Stack.
-    ListModelsResponse:
-      type: object
-      properties:
-        data:
-          type: array
-          items:
-            $ref: '#/components/schemas/Model'
-      additionalProperties: false
-      required:
-        - data
-      title: ListModelsResponse
-    RegisterModelRequest:
-      type: object
-      properties:
-        model_id:
-          type: string
-          description: The identifier of the model to register.
-        provider_model_id:
-          type: string
-          description: >-
-            The identifier of the model in the provider.
-        provider_id:
-          type: string
-          description: The identifier of the provider.
-        metadata:
-          type: object
-          additionalProperties:
-            oneOf:
-              - type: 'null'
-              - type: boolean
-              - type: number
-              - type: string
-              - type: array
-              - type: object
-          description: Any additional metadata for this model.
-        model_type:
-          $ref: '#/components/schemas/ModelType'
-          description: The type of model to register.
-      additionalProperties: false
-      required:
-        - model_id
-      title: RegisterModelRequest
    RunModerationRequest:
      type: object
      properties:
@ -7020,48 +7026,6 @@ components:
        - metadata
      title: ModerationObjectResults
      description: A moderation object.
-    OpenAIModel:
-      type: object
-      properties:
-        id:
-          type: string
-        object:
-          type: string
-          const: model
-          default: model
-        created:
-          type: integer
-        owned_by:
-          type: string
-        custom_metadata:
-          type: object
-          additionalProperties:
-            oneOf:
-              - type: 'null'
-              - type: boolean
-              - type: number
-              - type: string
-              - type: array
-              - type: object
-      additionalProperties: false
-      required:
-        - id
-        - object
-        - created
-        - owned_by
-      title: OpenAIModel
-      description: A model from OpenAI.
-    OpenAIListModelsResponse:
-      type: object
-      properties:
-        data:
-          type: array
-          items:
-            $ref: '#/components/schemas/OpenAIModel'
-      additionalProperties: false
-      required:
-        - data
-      title: OpenAIListModelsResponse
    Prompt:
      type: object
      properties:
--- a/docs/static/deprecated-llama-stack-spec.yaml
+++ b/docs/static/deprecated-llama-stack-spec.yaml
--- a/docs/static/llama-stack-spec.yaml
+++ b/docs/static/llama-stack-spec.yaml
@ -974,11 +974,11 @@ paths:
    get:
      responses:
        '200':
-          description: A ListModelsResponse.
+          description: A OpenAIListModelsResponse.
          content:
            application/json:
              schema:
-                $ref: '#/components/schemas/ListModelsResponse'
+                $ref: '#/components/schemas/OpenAIListModelsResponse'
        '400':
          $ref: '#/components/responses/BadRequest400'
        '429':
@ -991,8 +991,8 @@ paths:
          $ref: '#/components/responses/DefaultError'
      tags:
        - Models
-      summary: List all models.
-      description: List all models.
+      summary: List models using the OpenAI API.
+      description: List models using the OpenAI API.
      parameters: []
      deprecated: false
    post:
@ -1126,31 +1126,6 @@ paths:
              $ref: '#/components/schemas/RunModerationRequest'
        required: true
      deprecated: false
-  /v1/openai/v1/models:
-    get:
-      responses:
-        '200':
-          description: A OpenAIListModelsResponse.
-          content:
-            application/json:
-              schema:
-                $ref: '#/components/schemas/OpenAIListModelsResponse'
-        '400':
-          $ref: '#/components/responses/BadRequest400'
-        '429':
-          $ref: >-
-            #/components/responses/TooManyRequests429
-        '500':
-          $ref: >-
-            #/components/responses/InternalServerError500
-        default:
-          $ref: '#/components/responses/DefaultError'
-      tags:
-        - Models
-      summary: List models using the OpenAI API.
-      description: List models using the OpenAI API.
-      parameters: []
-      deprecated: false
  /v1/prompts:
    get:
      responses:
@ -5610,6 +5585,88 @@ components:
      title: ListRoutesResponse
      description: >-
        Response containing a list of all available API routes.
+    OpenAIModel:
+      type: object
+      properties:
+        id:
+          type: string
+        object:
+          type: string
+          const: model
+          default: model
+        created:
+          type: integer
+        owned_by:
+          type: string
+        custom_metadata:
+          type: object
+          additionalProperties:
+            oneOf:
+              - type: 'null'
+              - type: boolean
+              - type: number
+              - type: string
+              - type: array
+              - type: object
+      additionalProperties: false
+      required:
+        - id
+        - object
+        - created
+        - owned_by
+      title: OpenAIModel
+      description: A model from OpenAI.
+    OpenAIListModelsResponse:
+      type: object
+      properties:
+        data:
+          type: array
+          items:
+            $ref: '#/components/schemas/OpenAIModel'
+      additionalProperties: false
+      required:
+        - data
+      title: OpenAIListModelsResponse
+    ModelType:
+      type: string
+      enum:
+        - llm
+        - embedding
+        - rerank
+      title: ModelType
+      description: >-
+        Enumeration of supported model types in Llama Stack.
+    RegisterModelRequest:
+      type: object
+      properties:
+        model_id:
+          type: string
+          description: The identifier of the model to register.
+        provider_model_id:
+          type: string
+          description: >-
+            The identifier of the model in the provider.
+        provider_id:
+          type: string
+          description: The identifier of the provider.
+        metadata:
+          type: object
+          additionalProperties:
+            oneOf:
+              - type: 'null'
+              - type: boolean
+              - type: number
+              - type: string
+              - type: array
+              - type: object
+          description: Any additional metadata for this model.
+        model_type:
+          $ref: '#/components/schemas/ModelType'
+          description: The type of model to register.
+      additionalProperties: false
+      required:
+        - model_id
+      title: RegisterModelRequest
    Model:
      type: object
      properties:
@ -5667,57 +5724,6 @@ components:
      title: Model
      description: >-
        A model resource representing an AI model registered in Llama Stack.
-    ModelType:
-      type: string
-      enum:
-        - llm
-        - embedding
-        - rerank
-      title: ModelType
-      description: >-
-        Enumeration of supported model types in Llama Stack.
-    ListModelsResponse:
-      type: object
-      properties:
-        data:
-          type: array
-          items:
-            $ref: '#/components/schemas/Model'
-      additionalProperties: false
-      required:
-        - data
-      title: ListModelsResponse
-    RegisterModelRequest:
-      type: object
-      properties:
-        model_id:
-          type: string
-          description: The identifier of the model to register.
-        provider_model_id:
-          type: string
-          description: >-
-            The identifier of the model in the provider.
-        provider_id:
-          type: string
-          description: The identifier of the provider.
-        metadata:
-          type: object
-          additionalProperties:
-            oneOf:
-              - type: 'null'
-              - type: boolean
-              - type: number
-              - type: string
-              - type: array
-              - type: object
-          description: Any additional metadata for this model.
-        model_type:
-          $ref: '#/components/schemas/ModelType'
-          description: The type of model to register.
-      additionalProperties: false
-      required:
-        - model_id
-      title: RegisterModelRequest
    RunModerationRequest:
      type: object
      properties:
@ -5807,48 +5813,6 @@ components:
        - metadata
      title: ModerationObjectResults
      description: A moderation object.
-    OpenAIModel:
-      type: object
-      properties:
-        id:
-          type: string
-        object:
-          type: string
-          const: model
-          default: model
-        created:
-          type: integer
-        owned_by:
-          type: string
-        custom_metadata:
-          type: object
-          additionalProperties:
-            oneOf:
-              - type: 'null'
-              - type: boolean
-              - type: number
-              - type: string
-              - type: array
-              - type: object
-      additionalProperties: false
-      required:
-        - id
-        - object
-        - created
-        - owned_by
-      title: OpenAIModel
-      description: A model from OpenAI.
-    OpenAIListModelsResponse:
-      type: object
-      properties:
-        data:
-          type: array
-          items:
-            $ref: '#/components/schemas/OpenAIModel'
-      additionalProperties: false
-      required:
-        - data
-      title: OpenAIListModelsResponse
    Prompt:
      type: object
      properties:
--- a/docs/static/stainless-llama-stack-spec.yaml
+++ b/docs/static/stainless-llama-stack-spec.yaml
@ -977,11 +977,11 @@ paths:
    get:
      responses:
        '200':
-          description: A ListModelsResponse.
+          description: A OpenAIListModelsResponse.
          content:
            application/json:
              schema:
-                $ref: '#/components/schemas/ListModelsResponse'
+                $ref: '#/components/schemas/OpenAIListModelsResponse'
        '400':
          $ref: '#/components/responses/BadRequest400'
        '429':
@ -994,8 +994,8 @@ paths:
          $ref: '#/components/responses/DefaultError'
      tags:
        - Models
-      summary: List all models.
-      description: List all models.
+      summary: List models using the OpenAI API.
+      description: List models using the OpenAI API.
      parameters: []
      deprecated: false
    post:
@ -1129,31 +1129,6 @@ paths:
              $ref: '#/components/schemas/RunModerationRequest'
        required: true
      deprecated: false
-  /v1/openai/v1/models:
-    get:
-      responses:
-        '200':
-          description: A OpenAIListModelsResponse.
-          content:
-            application/json:
-              schema:
-                $ref: '#/components/schemas/OpenAIListModelsResponse'
-        '400':
-          $ref: '#/components/responses/BadRequest400'
-        '429':
-          $ref: >-
-            #/components/responses/TooManyRequests429
-        '500':
-          $ref: >-
-            #/components/responses/InternalServerError500
-        default:
-          $ref: '#/components/responses/DefaultError'
-      tags:
-        - Models
-      summary: List models using the OpenAI API.
-      description: List models using the OpenAI API.
-      parameters: []
-      deprecated: false
  /v1/prompts:
    get:
      responses:
@ -6823,6 +6798,88 @@ components:
      title: ListRoutesResponse
      description: >-
        Response containing a list of all available API routes.
+    OpenAIModel:
+      type: object
+      properties:
+        id:
+          type: string
+        object:
+          type: string
+          const: model
+          default: model
+        created:
+          type: integer
+        owned_by:
+          type: string
+        custom_metadata:
+          type: object
+          additionalProperties:
+            oneOf:
+              - type: 'null'
+              - type: boolean
+              - type: number
+              - type: string
+              - type: array
+              - type: object
+      additionalProperties: false
+      required:
+        - id
+        - object
+        - created
+        - owned_by
+      title: OpenAIModel
+      description: A model from OpenAI.
+    OpenAIListModelsResponse:
+      type: object
+      properties:
+        data:
+          type: array
+          items:
+            $ref: '#/components/schemas/OpenAIModel'
+      additionalProperties: false
+      required:
+        - data
+      title: OpenAIListModelsResponse
+    ModelType:
+      type: string
+      enum:
+        - llm
+        - embedding
+        - rerank
+      title: ModelType
+      description: >-
+        Enumeration of supported model types in Llama Stack.
+    RegisterModelRequest:
+      type: object
+      properties:
+        model_id:
+          type: string
+          description: The identifier of the model to register.
+        provider_model_id:
+          type: string
+          description: >-
+            The identifier of the model in the provider.
+        provider_id:
+          type: string
+          description: The identifier of the provider.
+        metadata:
+          type: object
+          additionalProperties:
+            oneOf:
+              - type: 'null'
+              - type: boolean
+              - type: number
+              - type: string
+              - type: array
+              - type: object
+          description: Any additional metadata for this model.
+        model_type:
+          $ref: '#/components/schemas/ModelType'
+          description: The type of model to register.
+      additionalProperties: false
+      required:
+        - model_id
+      title: RegisterModelRequest
    Model:
      type: object
      properties:
@ -6880,57 +6937,6 @@ components:
      title: Model
      description: >-
        A model resource representing an AI model registered in Llama Stack.
-    ModelType:
-      type: string
-      enum:
-        - llm
-        - embedding
-        - rerank
-      title: ModelType
-      description: >-
-        Enumeration of supported model types in Llama Stack.
-    ListModelsResponse:
-      type: object
-      properties:
-        data:
-          type: array
-          items:
-            $ref: '#/components/schemas/Model'
-      additionalProperties: false
-      required:
-        - data
-      title: ListModelsResponse
-    RegisterModelRequest:
-      type: object
-      properties:
-        model_id:
-          type: string
-          description: The identifier of the model to register.
-        provider_model_id:
-          type: string
-          description: >-
-            The identifier of the model in the provider.
-        provider_id:
-          type: string
-          description: The identifier of the provider.
-        metadata:
-          type: object
-          additionalProperties:
-            oneOf:
-              - type: 'null'
-              - type: boolean
-              - type: number
-              - type: string
-              - type: array
-              - type: object
-          description: Any additional metadata for this model.
-        model_type:
-          $ref: '#/components/schemas/ModelType'
-          description: The type of model to register.
-      additionalProperties: false
-      required:
-        - model_id
-      title: RegisterModelRequest
    RunModerationRequest:
      type: object
      properties:
@ -7020,48 +7026,6 @@ components:
        - metadata
      title: ModerationObjectResults
      description: A moderation object.
-    OpenAIModel:
-      type: object
-      properties:
-        id:
-          type: string
-        object:
-          type: string
-          const: model
-          default: model
-        created:
-          type: integer
-        owned_by:
-          type: string
-        custom_metadata:
-          type: object
-          additionalProperties:
-            oneOf:
-              - type: 'null'
-              - type: boolean
-              - type: number
-              - type: string
-              - type: array
-              - type: object
-      additionalProperties: false
-      required:
-        - id
-        - object
-        - created
-        - owned_by
-      title: OpenAIModel
-      description: A model from OpenAI.
-    OpenAIListModelsResponse:
-      type: object
-      properties:
-        data:
-          type: array
-          items:
-            $ref: '#/components/schemas/OpenAIModel'
-      additionalProperties: false
-      required:
-        - data
-      title: OpenAIListModelsResponse
    Prompt:
      type: object
      properties:
--- a/src/llama_stack/apis/agents/agents.py
+++ b/src/llama_stack/apis/agents/agents.py
@ -491,13 +491,6 @@ class Agents(Protocol):

    APIs for creating and interacting with agentic systems."""

-    @webmethod(
-        route="/agents",
-        method="POST",
-        descriptive_name="create_agent",
-        deprecated=True,
-        level=LLAMA_STACK_API_V1,
-    )
    @webmethod(
        route="/agents",
        method="POST",
@ -515,13 +508,6 @@ class Agents(Protocol):
        """
        ...

-    @webmethod(
-        route="/agents/{agent_id}/session/{session_id}/turn",
-        method="POST",
-        descriptive_name="create_agent_turn",
-        deprecated=True,
-        level=LLAMA_STACK_API_V1,
-    )
    @webmethod(
        route="/agents/{agent_id}/session/{session_id}/turn",
        method="POST",
@ -552,13 +538,6 @@ class Agents(Protocol):
        """
        ...

-    @webmethod(
-        route="/agents/{agent_id}/session/{session_id}/turn/{turn_id}/resume",
-        method="POST",
-        descriptive_name="resume_agent_turn",
-        deprecated=True,
-        level=LLAMA_STACK_API_V1,
-    )
    @webmethod(
        route="/agents/{agent_id}/session/{session_id}/turn/{turn_id}/resume",
        method="POST",
@ -586,12 +565,6 @@ class Agents(Protocol):
        """
        ...

-    @webmethod(
-        route="/agents/{agent_id}/session/{session_id}/turn/{turn_id}",
-        method="GET",
-        deprecated=True,
-        level=LLAMA_STACK_API_V1,
-    )
    @webmethod(
        route="/agents/{agent_id}/session/{session_id}/turn/{turn_id}",
        method="GET",
@ -612,12 +585,6 @@ class Agents(Protocol):
        """
        ...

-    @webmethod(
-        route="/agents/{agent_id}/session/{session_id}/turn/{turn_id}/step/{step_id}",
-        method="GET",
-        deprecated=True,
-        level=LLAMA_STACK_API_V1,
-    )
    @webmethod(
        route="/agents/{agent_id}/session/{session_id}/turn/{turn_id}/step/{step_id}",
        method="GET",
@ -640,13 +607,6 @@ class Agents(Protocol):
        """
        ...

-    @webmethod(
-        route="/agents/{agent_id}/session",
-        method="POST",
-        descriptive_name="create_agent_session",
-        deprecated=True,
-        level=LLAMA_STACK_API_V1,
-    )
    @webmethod(
        route="/agents/{agent_id}/session",
        method="POST",
@ -666,12 +626,6 @@ class Agents(Protocol):
        """
        ...

-    @webmethod(
-        route="/agents/{agent_id}/session/{session_id}",
-        method="GET",
-        deprecated=True,
-        level=LLAMA_STACK_API_V1,
-    )
    @webmethod(
        route="/agents/{agent_id}/session/{session_id}",
        method="GET",
@ -692,12 +646,6 @@ class Agents(Protocol):
        """
        ...

-    @webmethod(
-        route="/agents/{agent_id}/session/{session_id}",
-        method="DELETE",
-        deprecated=True,
-        level=LLAMA_STACK_API_V1,
-    )
    @webmethod(
        route="/agents/{agent_id}/session/{session_id}",
        method="DELETE",
@ -715,12 +663,6 @@ class Agents(Protocol):
        """
        ...

-    @webmethod(
-        route="/agents/{agent_id}",
-        method="DELETE",
-        deprecated=True,
-        level=LLAMA_STACK_API_V1,
-    )
    @webmethod(route="/agents/{agent_id}", method="DELETE", level=LLAMA_STACK_API_V1ALPHA)
    async def delete_agent(
        self,
@ -732,7 +674,6 @@ class Agents(Protocol):
        """
        ...

-    @webmethod(route="/agents", method="GET", deprecated=True, level=LLAMA_STACK_API_V1)
    @webmethod(route="/agents", method="GET", level=LLAMA_STACK_API_V1ALPHA)
    async def list_agents(self, start_index: int | None = None, limit: int | None = None) -> PaginatedResponse:
        """List all agents.
@ -743,12 +684,6 @@ class Agents(Protocol):
        """
        ...

-    @webmethod(
-        route="/agents/{agent_id}",
-        method="GET",
-        deprecated=True,
-        level=LLAMA_STACK_API_V1,
-    )
    @webmethod(route="/agents/{agent_id}", method="GET", level=LLAMA_STACK_API_V1ALPHA)
    async def get_agent(self, agent_id: str) -> Agent:
        """Describe an agent by its ID.
@ -758,12 +693,6 @@ class Agents(Protocol):
        """
        ...

-    @webmethod(
-        route="/agents/{agent_id}/sessions",
-        method="GET",
-        deprecated=True,
-        level=LLAMA_STACK_API_V1,
-    )
    @webmethod(route="/agents/{agent_id}/sessions", method="GET", level=LLAMA_STACK_API_V1ALPHA)
    async def list_agent_sessions(
        self,
@ -787,12 +716,6 @@ class Agents(Protocol):
    #
    # Both of these APIs are inherently stateful.

-    @webmethod(
-        route="/openai/v1/responses/{response_id}",
-        method="GET",
-        level=LLAMA_STACK_API_V1,
-        deprecated=True,
-    )
    @webmethod(route="/responses/{response_id}", method="GET", level=LLAMA_STACK_API_V1)
    async def get_openai_response(
        self,
@ -805,7 +728,6 @@ class Agents(Protocol):
        """
        ...

-    @webmethod(route="/openai/v1/responses", method="POST", level=LLAMA_STACK_API_V1, deprecated=True)
    @webmethod(route="/responses", method="POST", level=LLAMA_STACK_API_V1)
    async def create_openai_response(
        self,
@ -842,7 +764,6 @@ class Agents(Protocol):
        """
        ...

-    @webmethod(route="/openai/v1/responses", method="GET", level=LLAMA_STACK_API_V1, deprecated=True)
    @webmethod(route="/responses", method="GET", level=LLAMA_STACK_API_V1)
    async def list_openai_responses(
        self,
@ -861,9 +782,6 @@ class Agents(Protocol):
        """
        ...

-    @webmethod(
-        route="/openai/v1/responses/{response_id}/input_items", method="GET", level=LLAMA_STACK_API_V1, deprecated=True
-    )
    @webmethod(route="/responses/{response_id}/input_items", method="GET", level=LLAMA_STACK_API_V1)
    async def list_openai_response_input_items(
        self,
@ -886,7 +804,6 @@ class Agents(Protocol):
        """
        ...

-    @webmethod(route="/openai/v1/responses/{response_id}", method="DELETE", level=LLAMA_STACK_API_V1, deprecated=True)
    @webmethod(route="/responses/{response_id}", method="DELETE", level=LLAMA_STACK_API_V1)
    async def delete_openai_response(self, response_id: str) -> OpenAIDeleteResponseObject:
        """Delete a response.
--- a/src/llama_stack/apis/batches/batches.py
+++ b/src/llama_stack/apis/batches/batches.py
@ -43,7 +43,6 @@ class Batches(Protocol):
    Note: This API is currently under active development and may undergo changes.
    """

-    @webmethod(route="/openai/v1/batches", method="POST", level=LLAMA_STACK_API_V1, deprecated=True)
    @webmethod(route="/batches", method="POST", level=LLAMA_STACK_API_V1)
    async def create_batch(
        self,
@ -64,7 +63,6 @@ class Batches(Protocol):
        """
        ...

-    @webmethod(route="/openai/v1/batches/{batch_id}", method="GET", level=LLAMA_STACK_API_V1, deprecated=True)
    @webmethod(route="/batches/{batch_id}", method="GET", level=LLAMA_STACK_API_V1)
    async def retrieve_batch(self, batch_id: str) -> BatchObject:
        """Retrieve information about a specific batch.
@ -74,7 +72,6 @@ class Batches(Protocol):
        """
        ...

-    @webmethod(route="/openai/v1/batches/{batch_id}/cancel", method="POST", level=LLAMA_STACK_API_V1, deprecated=True)
    @webmethod(route="/batches/{batch_id}/cancel", method="POST", level=LLAMA_STACK_API_V1)
    async def cancel_batch(self, batch_id: str) -> BatchObject:
        """Cancel a batch that is in progress.
@ -84,7 +81,6 @@ class Batches(Protocol):
        """
        ...

-    @webmethod(route="/openai/v1/batches", method="GET", level=LLAMA_STACK_API_V1, deprecated=True)
    @webmethod(route="/batches", method="GET", level=LLAMA_STACK_API_V1)
    async def list_batches(
        self,
--- a/src/llama_stack/apis/benchmarks/benchmarks.py
+++ b/src/llama_stack/apis/benchmarks/benchmarks.py
@ -8,7 +8,7 @@ from typing import Any, Literal, Protocol, runtime_checkable
 from pydantic import BaseModel, Field

 from llama_stack.apis.resource import Resource, ResourceType
-from llama_stack.apis.version import LLAMA_STACK_API_V1, LLAMA_STACK_API_V1ALPHA
+from llama_stack.apis.version import LLAMA_STACK_API_V1ALPHA
 from llama_stack.schema_utils import json_schema_type, webmethod


@ -54,7 +54,6 @@ class ListBenchmarksResponse(BaseModel):

@runtime_checkable
 class Benchmarks(Protocol):
-    @webmethod(route="/eval/benchmarks", method="GET", level=LLAMA_STACK_API_V1, deprecated=True)
    @webmethod(route="/eval/benchmarks", method="GET", level=LLAMA_STACK_API_V1ALPHA)
    async def list_benchmarks(self) -> ListBenchmarksResponse:
        """List all benchmarks.
@ -63,7 +62,6 @@ class Benchmarks(Protocol):
        """
        ...

-    @webmethod(route="/eval/benchmarks/{benchmark_id}", method="GET", level=LLAMA_STACK_API_V1, deprecated=True)
    @webmethod(route="/eval/benchmarks/{benchmark_id}", method="GET", level=LLAMA_STACK_API_V1ALPHA)
    async def get_benchmark(
        self,
@ -76,7 +74,6 @@ class Benchmarks(Protocol):
        """
        ...

-    @webmethod(route="/eval/benchmarks", method="POST", level=LLAMA_STACK_API_V1, deprecated=True)
    @webmethod(route="/eval/benchmarks", method="POST", level=LLAMA_STACK_API_V1ALPHA)
    async def register_benchmark(
        self,
@ -98,7 +95,6 @@ class Benchmarks(Protocol):
        """
        ...

-    @webmethod(route="/eval/benchmarks/{benchmark_id}", method="DELETE", level=LLAMA_STACK_API_V1, deprecated=True)
    @webmethod(route="/eval/benchmarks/{benchmark_id}", method="DELETE", level=LLAMA_STACK_API_V1ALPHA)
    async def unregister_benchmark(self, benchmark_id: str) -> None:
        """Unregister a benchmark.
--- a/src/llama_stack/apis/datasetio/datasetio.py
+++ b/src/llama_stack/apis/datasetio/datasetio.py
@ -8,7 +8,7 @@ from typing import Any, Protocol, runtime_checkable

 from llama_stack.apis.common.responses import PaginatedResponse
 from llama_stack.apis.datasets import Dataset
-from llama_stack.apis.version import LLAMA_STACK_API_V1, LLAMA_STACK_API_V1BETA
+from llama_stack.apis.version import LLAMA_STACK_API_V1BETA
 from llama_stack.schema_utils import webmethod


@ -21,7 +21,6 @@ class DatasetIO(Protocol):
    # keeping for aligning with inference/safety, but this is not used
    dataset_store: DatasetStore

-    @webmethod(route="/datasetio/iterrows/{dataset_id:path}", method="GET", deprecated=True, level=LLAMA_STACK_API_V1)
    @webmethod(route="/datasetio/iterrows/{dataset_id:path}", method="GET", level=LLAMA_STACK_API_V1BETA)
    async def iterrows(
        self,
@ -46,9 +45,6 @@ class DatasetIO(Protocol):
        """
        ...

-    @webmethod(
-        route="/datasetio/append-rows/{dataset_id:path}", method="POST", deprecated=True, level=LLAMA_STACK_API_V1
-    )
    @webmethod(route="/datasetio/append-rows/{dataset_id:path}", method="POST", level=LLAMA_STACK_API_V1BETA)
    async def append_rows(self, dataset_id: str, rows: list[dict[str, Any]]) -> None:
        """Append rows to a dataset.
--- a/src/llama_stack/apis/datasets/datasets.py
+++ b/src/llama_stack/apis/datasets/datasets.py
@ -10,7 +10,7 @@ from typing import Annotated, Any, Literal, Protocol
 from pydantic import BaseModel, Field

 from llama_stack.apis.resource import Resource, ResourceType
-from llama_stack.apis.version import LLAMA_STACK_API_V1, LLAMA_STACK_API_V1BETA
+from llama_stack.apis.version import LLAMA_STACK_API_V1BETA
 from llama_stack.schema_utils import json_schema_type, register_schema, webmethod


@ -146,7 +146,6 @@ class ListDatasetsResponse(BaseModel):


 class Datasets(Protocol):
-    @webmethod(route="/datasets", method="POST", deprecated=True, level=LLAMA_STACK_API_V1)
    @webmethod(route="/datasets", method="POST", level=LLAMA_STACK_API_V1BETA)
    async def register_dataset(
        self,
@ -216,7 +215,6 @@ class Datasets(Protocol):
        """
        ...

-    @webmethod(route="/datasets/{dataset_id:path}", method="GET", deprecated=True, level=LLAMA_STACK_API_V1)
    @webmethod(route="/datasets/{dataset_id:path}", method="GET", level=LLAMA_STACK_API_V1BETA)
    async def get_dataset(
        self,
@ -229,7 +227,6 @@ class Datasets(Protocol):
        """
        ...

-    @webmethod(route="/datasets", method="GET", deprecated=True, level=LLAMA_STACK_API_V1)
    @webmethod(route="/datasets", method="GET", level=LLAMA_STACK_API_V1BETA)
    async def list_datasets(self) -> ListDatasetsResponse:
        """List all datasets.
@ -238,7 +235,6 @@ class Datasets(Protocol):
        """
        ...

-    @webmethod(route="/datasets/{dataset_id:path}", method="DELETE", deprecated=True, level=LLAMA_STACK_API_V1)
    @webmethod(route="/datasets/{dataset_id:path}", method="DELETE", level=LLAMA_STACK_API_V1BETA)
    async def unregister_dataset(
        self,
--- a/src/llama_stack/apis/eval/eval.py
+++ b/src/llama_stack/apis/eval/eval.py
@ -13,7 +13,7 @@ from llama_stack.apis.common.job_types import Job
 from llama_stack.apis.inference import SamplingParams, SystemMessage
 from llama_stack.apis.scoring import ScoringResult
 from llama_stack.apis.scoring_functions import ScoringFnParams
-from llama_stack.apis.version import LLAMA_STACK_API_V1, LLAMA_STACK_API_V1ALPHA
+from llama_stack.apis.version import LLAMA_STACK_API_V1ALPHA
 from llama_stack.schema_utils import json_schema_type, register_schema, webmethod


@ -86,7 +86,6 @@ class Eval(Protocol):

    Llama Stack Evaluation API for running evaluations on model and agent candidates."""

-    @webmethod(route="/eval/benchmarks/{benchmark_id}/jobs", method="POST", level=LLAMA_STACK_API_V1, deprecated=True)
    @webmethod(route="/eval/benchmarks/{benchmark_id}/jobs", method="POST", level=LLAMA_STACK_API_V1ALPHA)
    async def run_eval(
        self,
@ -101,9 +100,6 @@ class Eval(Protocol):
        """
        ...

-    @webmethod(
-        route="/eval/benchmarks/{benchmark_id}/evaluations", method="POST", level=LLAMA_STACK_API_V1, deprecated=True
-    )
    @webmethod(route="/eval/benchmarks/{benchmark_id}/evaluations", method="POST", level=LLAMA_STACK_API_V1ALPHA)
    async def evaluate_rows(
        self,
@ -122,9 +118,6 @@ class Eval(Protocol):
        """
        ...

-    @webmethod(
-        route="/eval/benchmarks/{benchmark_id}/jobs/{job_id}", method="GET", level=LLAMA_STACK_API_V1, deprecated=True
-    )
    @webmethod(route="/eval/benchmarks/{benchmark_id}/jobs/{job_id}", method="GET", level=LLAMA_STACK_API_V1ALPHA)
    async def job_status(self, benchmark_id: str, job_id: str) -> Job:
        """Get the status of a job.
@ -135,12 +128,6 @@ class Eval(Protocol):
        """
        ...

-    @webmethod(
-        route="/eval/benchmarks/{benchmark_id}/jobs/{job_id}",
-        method="DELETE",
-        level=LLAMA_STACK_API_V1,
-        deprecated=True,
-    )
    @webmethod(route="/eval/benchmarks/{benchmark_id}/jobs/{job_id}", method="DELETE", level=LLAMA_STACK_API_V1ALPHA)
    async def job_cancel(self, benchmark_id: str, job_id: str) -> None:
        """Cancel a job.
@ -150,12 +137,6 @@ class Eval(Protocol):
        """
        ...

-    @webmethod(
-        route="/eval/benchmarks/{benchmark_id}/jobs/{job_id}/result",
-        method="GET",
-        level=LLAMA_STACK_API_V1,
-        deprecated=True,
-    )
    @webmethod(
        route="/eval/benchmarks/{benchmark_id}/jobs/{job_id}/result", method="GET", level=LLAMA_STACK_API_V1ALPHA
    )
--- a/src/llama_stack/apis/files/files.py
+++ b/src/llama_stack/apis/files/files.py
@ -110,7 +110,6 @@ class Files(Protocol):
    """

    # OpenAI Files API Endpoints
-    @webmethod(route="/openai/v1/files", method="POST", level=LLAMA_STACK_API_V1, deprecated=True)
    @webmethod(route="/files", method="POST", level=LLAMA_STACK_API_V1)
    async def openai_upload_file(
        self,
@ -134,7 +133,6 @@ class Files(Protocol):
        """
        ...

-    @webmethod(route="/openai/v1/files", method="GET", level=LLAMA_STACK_API_V1, deprecated=True)
    @webmethod(route="/files", method="GET", level=LLAMA_STACK_API_V1)
    async def openai_list_files(
        self,
@ -155,7 +153,6 @@ class Files(Protocol):
        """
        ...

-    @webmethod(route="/openai/v1/files/{file_id}", method="GET", level=LLAMA_STACK_API_V1, deprecated=True)
    @webmethod(route="/files/{file_id}", method="GET", level=LLAMA_STACK_API_V1)
    async def openai_retrieve_file(
        self,
@ -170,7 +167,6 @@ class Files(Protocol):
        """
        ...

-    @webmethod(route="/openai/v1/files/{file_id}", method="DELETE", level=LLAMA_STACK_API_V1, deprecated=True)
    @webmethod(route="/files/{file_id}", method="DELETE", level=LLAMA_STACK_API_V1)
    async def openai_delete_file(
        self,
@ -183,7 +179,6 @@ class Files(Protocol):
        """
        ...

-    @webmethod(route="/openai/v1/files/{file_id}/content", method="GET", level=LLAMA_STACK_API_V1, deprecated=True)
    @webmethod(route="/files/{file_id}/content", method="GET", level=LLAMA_STACK_API_V1)
    async def openai_retrieve_file_content(
        self,
--- a/src/llama_stack/apis/inference/inference.py
+++ b/src/llama_stack/apis/inference/inference.py
@ -1189,7 +1189,6 @@ class InferenceProvider(Protocol):
        raise NotImplementedError("Reranking is not implemented")
        return  # this is so mypy's safe-super rule will consider the method concrete

-    @webmethod(route="/openai/v1/completions", method="POST", level=LLAMA_STACK_API_V1, deprecated=True)
    @webmethod(route="/completions", method="POST", level=LLAMA_STACK_API_V1)
    async def openai_completion(
        self,
@ -1202,7 +1201,6 @@ class InferenceProvider(Protocol):
        """
        ...

-    @webmethod(route="/openai/v1/chat/completions", method="POST", level=LLAMA_STACK_API_V1, deprecated=True)
    @webmethod(route="/chat/completions", method="POST", level=LLAMA_STACK_API_V1)
    async def openai_chat_completion(
        self,
@ -1215,7 +1213,6 @@ class InferenceProvider(Protocol):
        """
        ...

-    @webmethod(route="/openai/v1/embeddings", method="POST", level=LLAMA_STACK_API_V1, deprecated=True)
    @webmethod(route="/embeddings", method="POST", level=LLAMA_STACK_API_V1)
    async def openai_embeddings(
        self,
@ -1240,7 +1237,6 @@ class Inference(InferenceProvider):
    - Rerank models: these models reorder the documents based on their relevance to a query.
    """

-    @webmethod(route="/openai/v1/chat/completions", method="GET", level=LLAMA_STACK_API_V1, deprecated=True)
    @webmethod(route="/chat/completions", method="GET", level=LLAMA_STACK_API_V1)
    async def list_chat_completions(
        self,
@ -1259,9 +1255,6 @@ class Inference(InferenceProvider):
        """
        raise NotImplementedError("List chat completions is not implemented")

-    @webmethod(
-        route="/openai/v1/chat/completions/{completion_id}", method="GET", level=LLAMA_STACK_API_V1, deprecated=True
-    )
    @webmethod(route="/chat/completions/{completion_id}", method="GET", level=LLAMA_STACK_API_V1)
    async def get_chat_completion(self, completion_id: str) -> OpenAICompletionWithInputMessages:
        """Get chat completion.
--- a/src/llama_stack/apis/models/models.py
+++ b/src/llama_stack/apis/models/models.py
@ -107,7 +107,6 @@ class OpenAIListModelsResponse(BaseModel):
@runtime_checkable
@trace_protocol
 class Models(Protocol):
-    @webmethod(route="/models", method="GET", level=LLAMA_STACK_API_V1)
    async def list_models(self) -> ListModelsResponse:
        """List all models.

@ -115,7 +114,7 @@ class Models(Protocol):
        """
        ...

-    @webmethod(route="/openai/v1/models", method="GET", level=LLAMA_STACK_API_V1)
+    @webmethod(route="/models", method="GET", level=LLAMA_STACK_API_V1)
    async def openai_list_models(self) -> OpenAIListModelsResponse:
        """List models using the OpenAI API.

--- a/src/llama_stack/apis/post_training/post_training.py
+++ b/src/llama_stack/apis/post_training/post_training.py
@ -13,7 +13,7 @@ from pydantic import BaseModel, Field
 from llama_stack.apis.common.content_types import URL
 from llama_stack.apis.common.job_types import JobStatus
 from llama_stack.apis.common.training_types import Checkpoint
-from llama_stack.apis.version import LLAMA_STACK_API_V1, LLAMA_STACK_API_V1ALPHA
+from llama_stack.apis.version import LLAMA_STACK_API_V1ALPHA
 from llama_stack.schema_utils import json_schema_type, register_schema, webmethod


@ -284,7 +284,6 @@ class PostTrainingJobArtifactsResponse(BaseModel):


 class PostTraining(Protocol):
-    @webmethod(route="/post-training/supervised-fine-tune", method="POST", level=LLAMA_STACK_API_V1, deprecated=True)
    @webmethod(route="/post-training/supervised-fine-tune", method="POST", level=LLAMA_STACK_API_V1ALPHA)
    async def supervised_fine_tune(
        self,
@ -312,7 +311,6 @@ class PostTraining(Protocol):
        """
        ...

-    @webmethod(route="/post-training/preference-optimize", method="POST", level=LLAMA_STACK_API_V1, deprecated=True)
    @webmethod(route="/post-training/preference-optimize", method="POST", level=LLAMA_STACK_API_V1ALPHA)
    async def preference_optimize(
        self,
@ -335,7 +333,6 @@ class PostTraining(Protocol):
        """
        ...

-    @webmethod(route="/post-training/jobs", method="GET", level=LLAMA_STACK_API_V1, deprecated=True)
    @webmethod(route="/post-training/jobs", method="GET", level=LLAMA_STACK_API_V1ALPHA)
    async def get_training_jobs(self) -> ListPostTrainingJobsResponse:
        """Get all training jobs.
@ -344,7 +341,6 @@ class PostTraining(Protocol):
        """
        ...

-    @webmethod(route="/post-training/job/status", method="GET", level=LLAMA_STACK_API_V1, deprecated=True)
    @webmethod(route="/post-training/job/status", method="GET", level=LLAMA_STACK_API_V1ALPHA)
    async def get_training_job_status(self, job_uuid: str) -> PostTrainingJobStatusResponse:
        """Get the status of a training job.
@ -354,7 +350,6 @@ class PostTraining(Protocol):
        """
        ...

-    @webmethod(route="/post-training/job/cancel", method="POST", level=LLAMA_STACK_API_V1, deprecated=True)
    @webmethod(route="/post-training/job/cancel", method="POST", level=LLAMA_STACK_API_V1ALPHA)
    async def cancel_training_job(self, job_uuid: str) -> None:
        """Cancel a training job.
@ -363,7 +358,6 @@ class PostTraining(Protocol):
        """
        ...

-    @webmethod(route="/post-training/job/artifacts", method="GET", level=LLAMA_STACK_API_V1, deprecated=True)
    @webmethod(route="/post-training/job/artifacts", method="GET", level=LLAMA_STACK_API_V1ALPHA)
    async def get_training_job_artifacts(self, job_uuid: str) -> PostTrainingJobArtifactsResponse:
        """Get the artifacts of a training job.
--- a/src/llama_stack/apis/safety/safety.py
+++ b/src/llama_stack/apis/safety/safety.py
@ -121,7 +121,6 @@ class Safety(Protocol):
        """
        ...

-    @webmethod(route="/openai/v1/moderations", method="POST", level=LLAMA_STACK_API_V1, deprecated=True)
    @webmethod(route="/moderations", method="POST", level=LLAMA_STACK_API_V1)
    async def run_moderation(self, input: str | list[str], model: str | None = None) -> ModerationObject:
        """Create moderation.
--- a/src/llama_stack/apis/vector_io/vector_io.py
+++ b/src/llama_stack/apis/vector_io/vector_io.py
@ -545,7 +545,6 @@ class VectorIO(Protocol):
        ...

    # OpenAI Vector Stores API endpoints
-    @webmethod(route="/openai/v1/vector_stores", method="POST", level=LLAMA_STACK_API_V1, deprecated=True)
    @webmethod(route="/vector_stores", method="POST", level=LLAMA_STACK_API_V1)
    async def openai_create_vector_store(
        self,
@ -558,7 +557,6 @@ class VectorIO(Protocol):
        """
        ...

-    @webmethod(route="/openai/v1/vector_stores", method="GET", level=LLAMA_STACK_API_V1, deprecated=True)
    @webmethod(route="/vector_stores", method="GET", level=LLAMA_STACK_API_V1)
    async def openai_list_vector_stores(
        self,
@ -577,9 +575,6 @@ class VectorIO(Protocol):
        """
        ...

-    @webmethod(
-        route="/openai/v1/vector_stores/{vector_store_id}", method="GET", level=LLAMA_STACK_API_V1, deprecated=True
-    )
    @webmethod(route="/vector_stores/{vector_store_id}", method="GET", level=LLAMA_STACK_API_V1)
    async def openai_retrieve_vector_store(
        self,
@ -592,9 +587,6 @@ class VectorIO(Protocol):
        """
        ...

-    @webmethod(
-        route="/openai/v1/vector_stores/{vector_store_id}", method="POST", level=LLAMA_STACK_API_V1, deprecated=True
-    )
    @webmethod(
        route="/vector_stores/{vector_store_id}",
        method="POST",
@ -617,9 +609,6 @@ class VectorIO(Protocol):
        """
        ...

-    @webmethod(
-        route="/openai/v1/vector_stores/{vector_store_id}", method="DELETE", level=LLAMA_STACK_API_V1, deprecated=True
-    )
    @webmethod(
        route="/vector_stores/{vector_store_id}",
        method="DELETE",
@ -636,12 +625,6 @@ class VectorIO(Protocol):
        """
        ...

-    @webmethod(
-        route="/openai/v1/vector_stores/{vector_store_id}/search",
-        method="POST",
-        level=LLAMA_STACK_API_V1,
-        deprecated=True,
-    )
    @webmethod(
        route="/vector_stores/{vector_store_id}/search",
        method="POST",
@ -674,12 +657,6 @@ class VectorIO(Protocol):
        """
        ...

-    @webmethod(
-        route="/openai/v1/vector_stores/{vector_store_id}/files",
-        method="POST",
-        level=LLAMA_STACK_API_V1,
-        deprecated=True,
-    )
    @webmethod(
        route="/vector_stores/{vector_store_id}/files",
        method="POST",
@ -702,12 +679,6 @@ class VectorIO(Protocol):
        """
        ...

-    @webmethod(
-        route="/openai/v1/vector_stores/{vector_store_id}/files",
-        method="GET",
-        level=LLAMA_STACK_API_V1,
-        deprecated=True,
-    )
    @webmethod(
        route="/vector_stores/{vector_store_id}/files",
        method="GET",
@ -734,12 +705,6 @@ class VectorIO(Protocol):
        """
        ...

-    @webmethod(
-        route="/openai/v1/vector_stores/{vector_store_id}/files/{file_id}",
-        method="GET",
-        level=LLAMA_STACK_API_V1,
-        deprecated=True,
-    )
    @webmethod(
        route="/vector_stores/{vector_store_id}/files/{file_id}",
        method="GET",
@ -758,12 +723,6 @@ class VectorIO(Protocol):
        """
        ...

-    @webmethod(
-        route="/openai/v1/vector_stores/{vector_store_id}/files/{file_id}/content",
-        method="GET",
-        level=LLAMA_STACK_API_V1,
-        deprecated=True,
-    )
    @webmethod(
        route="/vector_stores/{vector_store_id}/files/{file_id}/content",
        method="GET",
@ -782,12 +741,6 @@ class VectorIO(Protocol):
        """
        ...

-    @webmethod(
-        route="/openai/v1/vector_stores/{vector_store_id}/files/{file_id}",
-        method="POST",
-        level=LLAMA_STACK_API_V1,
-        deprecated=True,
-    )
    @webmethod(
        route="/vector_stores/{vector_store_id}/files/{file_id}",
        method="POST",
@ -808,12 +761,6 @@ class VectorIO(Protocol):
        """
        ...

-    @webmethod(
-        route="/openai/v1/vector_stores/{vector_store_id}/files/{file_id}",
-        method="DELETE",
-        level=LLAMA_STACK_API_V1,
-        deprecated=True,
-    )
    @webmethod(
        route="/vector_stores/{vector_store_id}/files/{file_id}",
        method="DELETE",
@ -837,12 +784,6 @@ class VectorIO(Protocol):
        method="POST",
        level=LLAMA_STACK_API_V1,
    )
-    @webmethod(
-        route="/openai/v1/vector_stores/{vector_store_id}/file_batches",
-        method="POST",
-        level=LLAMA_STACK_API_V1,
-        deprecated=True,
-    )
    async def openai_create_vector_store_file_batch(
        self,
        vector_store_id: str,
@ -861,12 +802,6 @@ class VectorIO(Protocol):
        method="GET",
        level=LLAMA_STACK_API_V1,
    )
-    @webmethod(
-        route="/openai/v1/vector_stores/{vector_store_id}/file_batches/{batch_id}",
-        method="GET",
-        level=LLAMA_STACK_API_V1,
-        deprecated=True,
-    )
    async def openai_retrieve_vector_store_file_batch(
        self,
        batch_id: str,
@ -880,12 +815,6 @@ class VectorIO(Protocol):
        """
        ...

-    @webmethod(
-        route="/openai/v1/vector_stores/{vector_store_id}/file_batches/{batch_id}/files",
-        method="GET",
-        level=LLAMA_STACK_API_V1,
-        deprecated=True,
-    )
    @webmethod(
        route="/vector_stores/{vector_store_id}/file_batches/{batch_id}/files",
        method="GET",
@ -914,12 +843,6 @@ class VectorIO(Protocol):
        """
        ...

-    @webmethod(
-        route="/openai/v1/vector_stores/{vector_store_id}/file_batches/{batch_id}/cancel",
-        method="POST",
-        level=LLAMA_STACK_API_V1,
-        deprecated=True,
-    )
    @webmethod(
        route="/vector_stores/{vector_store_id}/file_batches/{batch_id}/cancel",
        method="POST",
--- a/tests/unit/providers/nvidia/test_eval.py
+++ b/tests/unit/providers/nvidia/test_eval.py
@ -5,7 +5,6 @@
 # the root directory of this source tree.

 import os
-import unittest
 from unittest.mock import MagicMock, patch

 import pytest
@ -13,6 +12,8 @@ import pytest
 from llama_stack.apis.benchmarks import Benchmark
 from llama_stack.apis.common.job_types import Job, JobStatus
 from llama_stack.apis.eval.eval import BenchmarkConfig, EvaluateResponse, ModelCandidate, SamplingParams
+from llama_stack.apis.inference.inference import TopPSamplingStrategy
+from llama_stack.apis.resource import ResourceType
 from llama_stack.models.llama.sku_types import CoreModelId
 from llama_stack.providers.remote.eval.nvidia.config import NVIDIAEvalConfig
 from llama_stack.providers.remote.eval.nvidia.eval import NVIDIAEvalImpl
@ -21,193 +22,200 @@ MOCK_DATASET_ID = "default/test-dataset"
 MOCK_BENCHMARK_ID = "test-benchmark"


-class TestNVIDIAEvalImpl(unittest.TestCase):
-    def setUp(self):
-        os.environ["NVIDIA_EVALUATOR_URL"] = "http://nemo.test"
+@pytest.fixture
+def nvidia_eval_setup():
+    """Set up the NVIDIA eval implementation with mocked dependencies."""
+    os.environ["NVIDIA_EVALUATOR_URL"] = "http://nemo.test"

-        # Create mock APIs
-        self.datasetio_api = MagicMock()
-        self.datasets_api = MagicMock()
-        self.scoring_api = MagicMock()
-        self.inference_api = MagicMock()
-        self.agents_api = MagicMock()
+    # Create mock APIs
+    datasetio_api = MagicMock()
+    datasets_api = MagicMock()
+    scoring_api = MagicMock()
+    inference_api = MagicMock()
+    agents_api = MagicMock()

-        self.config = NVIDIAEvalConfig(
-            evaluator_url=os.environ["NVIDIA_EVALUATOR_URL"],
-        )
+    config = NVIDIAEvalConfig(
+        evaluator_url=os.environ["NVIDIA_EVALUATOR_URL"],
+    )

-        self.eval_impl = NVIDIAEvalImpl(
-            config=self.config,
-            datasetio_api=self.datasetio_api,
-            datasets_api=self.datasets_api,
-            scoring_api=self.scoring_api,
-            inference_api=self.inference_api,
-            agents_api=self.agents_api,
-        )
+    eval_impl = NVIDIAEvalImpl(
+        config=config,
+        datasetio_api=datasetio_api,
+        datasets_api=datasets_api,
+        scoring_api=scoring_api,
+        inference_api=inference_api,
+        agents_api=agents_api,
+    )

-        # Mock the HTTP request methods
-        self.evaluator_get_patcher = patch(
-            "llama_stack.providers.remote.eval.nvidia.eval.NVIDIAEvalImpl._evaluator_get"
-        )
-        self.evaluator_post_patcher = patch(
-            "llama_stack.providers.remote.eval.nvidia.eval.NVIDIAEvalImpl._evaluator_post"
-        )
-        self.evaluator_delete_patcher = patch(
-            "llama_stack.providers.remote.eval.nvidia.eval.NVIDIAEvalImpl._evaluator_delete"
-        )
-
-        self.mock_evaluator_get = self.evaluator_get_patcher.start()
-        self.mock_evaluator_post = self.evaluator_post_patcher.start()
-        self.mock_evaluator_delete = self.evaluator_delete_patcher.start()
-
-    def tearDown(self):
-        """Clean up after each test."""
-        self.evaluator_get_patcher.stop()
-        self.evaluator_post_patcher.stop()
-        self.evaluator_delete_patcher.stop()
-
-    def _assert_request_body(self, expected_json):
-        """Helper method to verify request body in Evaluator POST request is correct"""
-        call_args = self.mock_evaluator_post.call_args
-        actual_json = call_args[0][1]
-
-        # Check that all expected keys contain the expected values in the actual JSON
-        for key, value in expected_json.items():
-            assert key in actual_json, f"Key '{key}' missing in actual JSON"
-
-            if isinstance(value, dict):
-                for nested_key, nested_value in value.items():
-                    assert nested_key in actual_json[key], f"Nested key '{nested_key}' missing in actual JSON['{key}']"
-                    assert actual_json[key][nested_key] == nested_value, f"Value mismatch for '{key}.{nested_key}'"
-            else:
-                assert actual_json[key] == value, f"Value mismatch for '{key}'"
-
-    @pytest.fixture(autouse=True)
-    def inject_fixtures(self, run_async):
-        self.run_async = run_async
-
-    def test_register_benchmark(self):
-        eval_config = {
-            "type": "custom",
-            "params": {"parallelism": 8},
-            "tasks": {
-                "qa": {
-                    "type": "completion",
-                    "params": {"template": {"prompt": "{{prompt}}", "max_tokens": 200}},
-                    "dataset": {"files_url": f"hf://datasets/{MOCK_DATASET_ID}/testing/testing.jsonl"},
-                    "metrics": {"bleu": {"type": "bleu", "params": {"references": ["{{ideal_response}}"]}}},
-                }
-            },
+    # Mock the HTTP request methods
+    with (
+        patch("llama_stack.providers.remote.eval.nvidia.eval.NVIDIAEvalImpl._evaluator_get") as mock_evaluator_get,
+        patch("llama_stack.providers.remote.eval.nvidia.eval.NVIDIAEvalImpl._evaluator_post") as mock_evaluator_post,
+    ):
+        yield {
+            "eval_impl": eval_impl,
+            "mock_evaluator_get": mock_evaluator_get,
+            "mock_evaluator_post": mock_evaluator_post,
+            "datasetio_api": datasetio_api,
+            "datasets_api": datasets_api,
+            "scoring_api": scoring_api,
+            "inference_api": inference_api,
+            "agents_api": agents_api,
        }

-        benchmark = Benchmark(
-            provider_id="nvidia",
-            type="benchmark",
-            identifier=MOCK_BENCHMARK_ID,
-            dataset_id=MOCK_DATASET_ID,
-            scoring_functions=["basic::equality"],
-            metadata=eval_config,
-        )

-        # Mock Evaluator API response
-        mock_evaluator_response = {"id": MOCK_BENCHMARK_ID, "status": "created"}
-        self.mock_evaluator_post.return_value = mock_evaluator_response
+def _assert_request_body(mock_evaluator_post, expected_json):
+    """Helper method to verify request body in Evaluator POST request is correct"""
+    call_args = mock_evaluator_post.call_args
+    actual_json = call_args[0][1]

-        # Register the benchmark
-        self.run_async(self.eval_impl.register_benchmark(benchmark))
+    # Check that all expected keys contain the expected values in the actual JSON
+    for key, value in expected_json.items():
+        assert key in actual_json, f"Key '{key}' missing in actual JSON"

-        # Verify the Evaluator API was called correctly
-        self.mock_evaluator_post.assert_called_once()
-        self._assert_request_body({"namespace": benchmark.provider_id, "name": benchmark.identifier, **eval_config})
+        if isinstance(value, dict):
+            for nested_key, nested_value in value.items():
+                assert nested_key in actual_json[key], f"Nested key '{nested_key}' missing in actual JSON['{key}']"
+                assert actual_json[key][nested_key] == nested_value, f"Value mismatch for '{key}.{nested_key}'"
+        else:
+            assert actual_json[key] == value, f"Value mismatch for '{key}'"

-    def test_unregister_benchmark(self):
-        # Unregister the benchmark
-        self.run_async(self.eval_impl.unregister_benchmark(benchmark_id=MOCK_BENCHMARK_ID))

-        # Verify the Evaluator API was called correctly
-        self.mock_evaluator_delete.assert_called_once_with(f"/v1/evaluation/configs/nvidia/{MOCK_BENCHMARK_ID}")
+async def test_register_benchmark(nvidia_eval_setup):
+    eval_impl = nvidia_eval_setup["eval_impl"]
+    mock_evaluator_post = nvidia_eval_setup["mock_evaluator_post"]

-    def test_run_eval(self):
-        benchmark_config = BenchmarkConfig(
-            eval_candidate=ModelCandidate(
-                type="model",
-                model=CoreModelId.llama3_1_8b_instruct.value,
-                sampling_params=SamplingParams(max_tokens=100, temperature=0.7),
-            )
-        )
-
-        # Mock Evaluator API response
-        mock_evaluator_response = {"id": "job-123", "status": "created"}
-        self.mock_evaluator_post.return_value = mock_evaluator_response
-
-        # Run the Evaluation job
-        result = self.run_async(
-            self.eval_impl.run_eval(benchmark_id=MOCK_BENCHMARK_ID, benchmark_config=benchmark_config)
-        )
-
-        # Verify the Evaluator API was called correctly
-        self.mock_evaluator_post.assert_called_once()
-        self._assert_request_body(
-            {
-                "config": f"nvidia/{MOCK_BENCHMARK_ID}",
-                "target": {"type": "model", "model": "Llama3.1-8B-Instruct"},
+    eval_config = {
+        "type": "custom",
+        "params": {"parallelism": 8},
+        "tasks": {
+            "qa": {
+                "type": "completion",
+                "params": {"template": {"prompt": "{{prompt}}", "max_tokens": 200}},
+                "dataset": {"files_url": f"hf://datasets/{MOCK_DATASET_ID}/testing/testing.jsonl"},
+                "metrics": {"bleu": {"type": "bleu", "params": {"references": ["{{ideal_response}}"]}}},
            }
+        },
+    }
+
+    benchmark = Benchmark(
+        provider_id="nvidia",
+        type=ResourceType.benchmark,
+        identifier=MOCK_BENCHMARK_ID,
+        dataset_id=MOCK_DATASET_ID,
+        scoring_functions=["basic::equality"],
+        metadata=eval_config,
+    )
+
+    # Mock Evaluator API response
+    mock_evaluator_response = {"id": MOCK_BENCHMARK_ID, "status": "created"}
+    mock_evaluator_post.return_value = mock_evaluator_response
+
+    # Register the benchmark
+    await eval_impl.register_benchmark(benchmark)
+
+    # Verify the Evaluator API was called correctly
+    mock_evaluator_post.assert_called_once()
+    _assert_request_body(
+        mock_evaluator_post, {"namespace": benchmark.provider_id, "name": benchmark.identifier, **eval_config}
+    )
+
+
+async def test_run_eval(nvidia_eval_setup):
+    eval_impl = nvidia_eval_setup["eval_impl"]
+    mock_evaluator_post = nvidia_eval_setup["mock_evaluator_post"]
+
+    benchmark_config = BenchmarkConfig(
+        eval_candidate=ModelCandidate(
+            type="model",
+            model=CoreModelId.llama3_1_8b_instruct.value,
+            sampling_params=SamplingParams(max_tokens=100, strategy=TopPSamplingStrategy(temperature=0.7)),
        )
+    )

-        # Verify the result
-        assert isinstance(result, Job)
-        assert result.job_id == "job-123"
-        assert result.status == JobStatus.in_progress
+    # Mock Evaluator API response
+    mock_evaluator_response = {"id": "job-123", "status": "created"}
+    mock_evaluator_post.return_value = mock_evaluator_response

-    def test_job_status(self):
-        # Mock Evaluator API response
-        mock_evaluator_response = {"id": "job-123", "status": "completed"}
-        self.mock_evaluator_get.return_value = mock_evaluator_response
+    # Run the Evaluation job
+    result = await eval_impl.run_eval(benchmark_id=MOCK_BENCHMARK_ID, benchmark_config=benchmark_config)

-        # Get the Evaluation job
-        result = self.run_async(self.eval_impl.job_status(benchmark_id=MOCK_BENCHMARK_ID, job_id="job-123"))
+    # Verify the Evaluator API was called correctly
+    mock_evaluator_post.assert_called_once()
+    _assert_request_body(
+        mock_evaluator_post,
+        {
+            "config": f"nvidia/{MOCK_BENCHMARK_ID}",
+            "target": {"type": "model", "model": "Llama3.1-8B-Instruct"},
+        },
+    )

-        # Verify the result
-        assert isinstance(result, Job)
-        assert result.job_id == "job-123"
-        assert result.status == JobStatus.completed
+    # Verify the result
+    assert isinstance(result, Job)
+    assert result.job_id == "job-123"
+    assert result.status == JobStatus.in_progress

-        # Verify the API was called correctly
-        self.mock_evaluator_get.assert_called_once_with(f"/v1/evaluation/jobs/{result.job_id}")

-    def test_job_cancel(self):
-        # Mock Evaluator API response
-        mock_evaluator_response = {"id": "job-123", "status": "cancelled"}
-        self.mock_evaluator_post.return_value = mock_evaluator_response
+async def test_job_status(nvidia_eval_setup):
+    eval_impl = nvidia_eval_setup["eval_impl"]
+    mock_evaluator_get = nvidia_eval_setup["mock_evaluator_get"]

-        # Cancel the Evaluation job
-        self.run_async(self.eval_impl.job_cancel(benchmark_id=MOCK_BENCHMARK_ID, job_id="job-123"))
+    # Mock Evaluator API response
+    mock_evaluator_response = {"id": "job-123", "status": "completed"}
+    mock_evaluator_get.return_value = mock_evaluator_response

-        # Verify the API was called correctly
-        self.mock_evaluator_post.assert_called_once_with("/v1/evaluation/jobs/job-123/cancel", {})
+    # Get the Evaluation job
+    result = await eval_impl.job_status(benchmark_id=MOCK_BENCHMARK_ID, job_id="job-123")

-    def test_job_result(self):
-        # Mock Evaluator API responses
-        mock_job_status_response = {"id": "job-123", "status": "completed"}
-        mock_job_results_response = {
-            "id": "job-123",
-            "status": "completed",
-            "results": {MOCK_BENCHMARK_ID: {"score": 0.85, "details": {"accuracy": 0.85, "f1": 0.84}}},
-        }
-        self.mock_evaluator_get.side_effect = [
-            mock_job_status_response,  # First call to retrieve job
-            mock_job_results_response,  # Second call to retrieve job results
-        ]
+    # Verify the result
+    assert isinstance(result, Job)
+    assert result.job_id == "job-123"
+    assert result.status == JobStatus.completed

-        # Get the Evaluation job results
-        result = self.run_async(self.eval_impl.job_result(benchmark_id=MOCK_BENCHMARK_ID, job_id="job-123"))
+    # Verify the API was called correctly
+    mock_evaluator_get.assert_called_once_with(f"/v1/evaluation/jobs/{result.job_id}")

-        # Verify the result
-        assert isinstance(result, EvaluateResponse)
-        assert MOCK_BENCHMARK_ID in result.scores
-        assert result.scores[MOCK_BENCHMARK_ID].aggregated_results["results"][MOCK_BENCHMARK_ID]["score"] == 0.85

-        # Verify the API was called correctly
-        assert self.mock_evaluator_get.call_count == 2
-        self.mock_evaluator_get.assert_any_call("/v1/evaluation/jobs/job-123")
-        self.mock_evaluator_get.assert_any_call("/v1/evaluation/jobs/job-123/results")
+async def test_job_cancel(nvidia_eval_setup):
+    eval_impl = nvidia_eval_setup["eval_impl"]
+    mock_evaluator_post = nvidia_eval_setup["mock_evaluator_post"]
+
+    # Mock Evaluator API response
+    mock_evaluator_response = {"id": "job-123", "status": "cancelled"}
+    mock_evaluator_post.return_value = mock_evaluator_response
+
+    # Cancel the Evaluation job
+    await eval_impl.job_cancel(benchmark_id=MOCK_BENCHMARK_ID, job_id="job-123")
+
+    # Verify the API was called correctly
+    mock_evaluator_post.assert_called_once_with("/v1/evaluation/jobs/job-123/cancel", {})
+
+
+async def test_job_result(nvidia_eval_setup):
+    eval_impl = nvidia_eval_setup["eval_impl"]
+    mock_evaluator_get = nvidia_eval_setup["mock_evaluator_get"]
+
+    # Mock Evaluator API responses
+    mock_job_status_response = {"id": "job-123", "status": "completed"}
+    mock_job_results_response = {
+        "id": "job-123",
+        "status": "completed",
+        "results": {MOCK_BENCHMARK_ID: {"score": 0.85, "details": {"accuracy": 0.85, "f1": 0.84}}},
+    }
+    mock_evaluator_get.side_effect = [
+        mock_job_status_response,  # First call to retrieve job
+        mock_job_results_response,  # Second call to retrieve job results
+    ]
+
+    # Get the Evaluation job results
+    result = await eval_impl.job_result(benchmark_id=MOCK_BENCHMARK_ID, job_id="job-123")
+
+    # Verify the result
+    assert isinstance(result, EvaluateResponse)
+    assert MOCK_BENCHMARK_ID in result.scores
+    assert result.scores[MOCK_BENCHMARK_ID].aggregated_results["results"][MOCK_BENCHMARK_ID]["score"] == 0.85
+
+    # Verify the API was called correctly
+    assert mock_evaluator_get.call_count == 2
+    mock_evaluator_get.assert_any_call("/v1/evaluation/jobs/job-123")
+    mock_evaluator_get.assert_any_call("/v1/evaluation/jobs/job-123/results")