forked from phoenix-oss/llama-stack-mirror
feat: introduce APIs for retrieving chat completion requests (#2145)
# What does this PR do? This PR introduces APIs to retrieve past chat completion requests, which will be used in the LS UI. Our current `Telemetry` is ill-suited for this purpose as it's untyped, so we'd need to filter by obscure attribute names, making it brittle. Since these APIs are 'provided by stack' and don't need to be implemented by inference providers, we introduce a new InferenceProvider class, containing the existing inference protocol, which is implemented by inference providers. The APIs are OpenAI-compliant, with an additional `input_messages` field. ## Test Plan This PR just adds the API and marks them provided_by_stack. Start stack server -> doesn't crash
This commit is contained in:
parent
c7015d3d60
commit
047303e339
15 changed files with 1356 additions and 869 deletions
1202
docs/_static/llama-stack-spec.html
vendored
1202
docs/_static/llama-stack-spec.html
vendored
File diff suppressed because it is too large
Load diff
900
docs/_static/llama-stack-spec.yaml
vendored
900
docs/_static/llama-stack-spec.yaml
vendored
|
@ -827,6 +827,35 @@ paths:
|
||||||
required: true
|
required: true
|
||||||
schema:
|
schema:
|
||||||
type: string
|
type: string
|
||||||
|
/v1/openai/v1/chat/completions/{completion_id}:
|
||||||
|
get:
|
||||||
|
responses:
|
||||||
|
'200':
|
||||||
|
description: A OpenAICompletionWithInputMessages.
|
||||||
|
content:
|
||||||
|
application/json:
|
||||||
|
schema:
|
||||||
|
$ref: '#/components/schemas/OpenAICompletionWithInputMessages'
|
||||||
|
'400':
|
||||||
|
$ref: '#/components/responses/BadRequest400'
|
||||||
|
'429':
|
||||||
|
$ref: >-
|
||||||
|
#/components/responses/TooManyRequests429
|
||||||
|
'500':
|
||||||
|
$ref: >-
|
||||||
|
#/components/responses/InternalServerError500
|
||||||
|
default:
|
||||||
|
$ref: '#/components/responses/DefaultError'
|
||||||
|
tags:
|
||||||
|
- Inference
|
||||||
|
description: Describe a chat completion by its ID.
|
||||||
|
parameters:
|
||||||
|
- name: completion_id
|
||||||
|
in: path
|
||||||
|
description: ID of the chat completion.
|
||||||
|
required: true
|
||||||
|
schema:
|
||||||
|
type: string
|
||||||
/v1/datasets/{dataset_id}:
|
/v1/datasets/{dataset_id}:
|
||||||
get:
|
get:
|
||||||
responses:
|
responses:
|
||||||
|
@ -1795,6 +1824,89 @@ paths:
|
||||||
schema:
|
schema:
|
||||||
$ref: '#/components/schemas/RegisterBenchmarkRequest'
|
$ref: '#/components/schemas/RegisterBenchmarkRequest'
|
||||||
required: true
|
required: true
|
||||||
|
/v1/openai/v1/chat/completions:
|
||||||
|
get:
|
||||||
|
responses:
|
||||||
|
'200':
|
||||||
|
description: A ListOpenAIChatCompletionResponse.
|
||||||
|
content:
|
||||||
|
application/json:
|
||||||
|
schema:
|
||||||
|
$ref: '#/components/schemas/ListOpenAIChatCompletionResponse'
|
||||||
|
'400':
|
||||||
|
$ref: '#/components/responses/BadRequest400'
|
||||||
|
'429':
|
||||||
|
$ref: >-
|
||||||
|
#/components/responses/TooManyRequests429
|
||||||
|
'500':
|
||||||
|
$ref: >-
|
||||||
|
#/components/responses/InternalServerError500
|
||||||
|
default:
|
||||||
|
$ref: '#/components/responses/DefaultError'
|
||||||
|
tags:
|
||||||
|
- Inference
|
||||||
|
description: List all chat completions.
|
||||||
|
parameters:
|
||||||
|
- name: after
|
||||||
|
in: query
|
||||||
|
description: >-
|
||||||
|
The ID of the last chat completion to return.
|
||||||
|
required: false
|
||||||
|
schema:
|
||||||
|
type: string
|
||||||
|
- name: limit
|
||||||
|
in: query
|
||||||
|
description: >-
|
||||||
|
The maximum number of chat completions to return.
|
||||||
|
required: false
|
||||||
|
schema:
|
||||||
|
type: integer
|
||||||
|
- name: model
|
||||||
|
in: query
|
||||||
|
description: The model to filter by.
|
||||||
|
required: false
|
||||||
|
schema:
|
||||||
|
type: string
|
||||||
|
- name: order
|
||||||
|
in: query
|
||||||
|
description: >-
|
||||||
|
The order to sort the chat completions by: "asc" or "desc". Defaults to
|
||||||
|
"desc".
|
||||||
|
required: false
|
||||||
|
schema:
|
||||||
|
$ref: '#/components/schemas/Order'
|
||||||
|
post:
|
||||||
|
responses:
|
||||||
|
'200':
|
||||||
|
description: An OpenAIChatCompletion.
|
||||||
|
content:
|
||||||
|
application/json:
|
||||||
|
schema:
|
||||||
|
oneOf:
|
||||||
|
- $ref: '#/components/schemas/OpenAIChatCompletion'
|
||||||
|
- $ref: '#/components/schemas/OpenAIChatCompletionChunk'
|
||||||
|
'400':
|
||||||
|
$ref: '#/components/responses/BadRequest400'
|
||||||
|
'429':
|
||||||
|
$ref: >-
|
||||||
|
#/components/responses/TooManyRequests429
|
||||||
|
'500':
|
||||||
|
$ref: >-
|
||||||
|
#/components/responses/InternalServerError500
|
||||||
|
default:
|
||||||
|
$ref: '#/components/responses/DefaultError'
|
||||||
|
tags:
|
||||||
|
- Inference
|
||||||
|
description: >-
|
||||||
|
Generate an OpenAI-compatible chat completion for the given messages using
|
||||||
|
the specified model.
|
||||||
|
parameters: []
|
||||||
|
requestBody:
|
||||||
|
content:
|
||||||
|
application/json:
|
||||||
|
schema:
|
||||||
|
$ref: '#/components/schemas/OpenaiChatCompletionRequest'
|
||||||
|
required: true
|
||||||
/v1/datasets:
|
/v1/datasets:
|
||||||
get:
|
get:
|
||||||
responses:
|
responses:
|
||||||
|
@ -2261,39 +2373,6 @@ paths:
|
||||||
schema:
|
schema:
|
||||||
$ref: '#/components/schemas/LogEventRequest'
|
$ref: '#/components/schemas/LogEventRequest'
|
||||||
required: true
|
required: true
|
||||||
/v1/openai/v1/chat/completions:
|
|
||||||
post:
|
|
||||||
responses:
|
|
||||||
'200':
|
|
||||||
description: An OpenAIChatCompletion.
|
|
||||||
content:
|
|
||||||
application/json:
|
|
||||||
schema:
|
|
||||||
oneOf:
|
|
||||||
- $ref: '#/components/schemas/OpenAIChatCompletion'
|
|
||||||
- $ref: '#/components/schemas/OpenAIChatCompletionChunk'
|
|
||||||
'400':
|
|
||||||
$ref: '#/components/responses/BadRequest400'
|
|
||||||
'429':
|
|
||||||
$ref: >-
|
|
||||||
#/components/responses/TooManyRequests429
|
|
||||||
'500':
|
|
||||||
$ref: >-
|
|
||||||
#/components/responses/InternalServerError500
|
|
||||||
default:
|
|
||||||
$ref: '#/components/responses/DefaultError'
|
|
||||||
tags:
|
|
||||||
- Inference
|
|
||||||
description: >-
|
|
||||||
Generate an OpenAI-compatible chat completion for the given messages using
|
|
||||||
the specified model.
|
|
||||||
parameters: []
|
|
||||||
requestBody:
|
|
||||||
content:
|
|
||||||
application/json:
|
|
||||||
schema:
|
|
||||||
$ref: '#/components/schemas/OpenaiChatCompletionRequest'
|
|
||||||
required: true
|
|
||||||
/v1/openai/v1/completions:
|
/v1/openai/v1/completions:
|
||||||
post:
|
post:
|
||||||
responses:
|
responses:
|
||||||
|
@ -5479,6 +5558,369 @@ components:
|
||||||
- scoring_functions
|
- scoring_functions
|
||||||
- metadata
|
- metadata
|
||||||
title: Benchmark
|
title: Benchmark
|
||||||
|
OpenAIAssistantMessageParam:
|
||||||
|
type: object
|
||||||
|
properties:
|
||||||
|
role:
|
||||||
|
type: string
|
||||||
|
const: assistant
|
||||||
|
default: assistant
|
||||||
|
description: >-
|
||||||
|
Must be "assistant" to identify this as the model's response
|
||||||
|
content:
|
||||||
|
oneOf:
|
||||||
|
- type: string
|
||||||
|
- type: array
|
||||||
|
items:
|
||||||
|
$ref: '#/components/schemas/OpenAIChatCompletionContentPartParam'
|
||||||
|
description: The content of the model's response
|
||||||
|
name:
|
||||||
|
type: string
|
||||||
|
description: >-
|
||||||
|
(Optional) The name of the assistant message participant.
|
||||||
|
tool_calls:
|
||||||
|
type: array
|
||||||
|
items:
|
||||||
|
$ref: '#/components/schemas/OpenAIChatCompletionToolCall'
|
||||||
|
description: >-
|
||||||
|
List of tool calls. Each tool call is an OpenAIChatCompletionToolCall
|
||||||
|
object.
|
||||||
|
additionalProperties: false
|
||||||
|
required:
|
||||||
|
- role
|
||||||
|
title: OpenAIAssistantMessageParam
|
||||||
|
description: >-
|
||||||
|
A message containing the model's (assistant) response in an OpenAI-compatible
|
||||||
|
chat completion request.
|
||||||
|
"OpenAIChatCompletionContentPartImageParam":
|
||||||
|
type: object
|
||||||
|
properties:
|
||||||
|
type:
|
||||||
|
type: string
|
||||||
|
const: image_url
|
||||||
|
default: image_url
|
||||||
|
image_url:
|
||||||
|
$ref: '#/components/schemas/OpenAIImageURL'
|
||||||
|
additionalProperties: false
|
||||||
|
required:
|
||||||
|
- type
|
||||||
|
- image_url
|
||||||
|
title: >-
|
||||||
|
OpenAIChatCompletionContentPartImageParam
|
||||||
|
OpenAIChatCompletionContentPartParam:
|
||||||
|
oneOf:
|
||||||
|
- $ref: '#/components/schemas/OpenAIChatCompletionContentPartTextParam'
|
||||||
|
- $ref: '#/components/schemas/OpenAIChatCompletionContentPartImageParam'
|
||||||
|
discriminator:
|
||||||
|
propertyName: type
|
||||||
|
mapping:
|
||||||
|
text: '#/components/schemas/OpenAIChatCompletionContentPartTextParam'
|
||||||
|
image_url: '#/components/schemas/OpenAIChatCompletionContentPartImageParam'
|
||||||
|
OpenAIChatCompletionContentPartTextParam:
|
||||||
|
type: object
|
||||||
|
properties:
|
||||||
|
type:
|
||||||
|
type: string
|
||||||
|
const: text
|
||||||
|
default: text
|
||||||
|
text:
|
||||||
|
type: string
|
||||||
|
additionalProperties: false
|
||||||
|
required:
|
||||||
|
- type
|
||||||
|
- text
|
||||||
|
title: OpenAIChatCompletionContentPartTextParam
|
||||||
|
OpenAIChatCompletionToolCall:
|
||||||
|
type: object
|
||||||
|
properties:
|
||||||
|
index:
|
||||||
|
type: integer
|
||||||
|
id:
|
||||||
|
type: string
|
||||||
|
type:
|
||||||
|
type: string
|
||||||
|
const: function
|
||||||
|
default: function
|
||||||
|
function:
|
||||||
|
$ref: '#/components/schemas/OpenAIChatCompletionToolCallFunction'
|
||||||
|
additionalProperties: false
|
||||||
|
required:
|
||||||
|
- type
|
||||||
|
title: OpenAIChatCompletionToolCall
|
||||||
|
OpenAIChatCompletionToolCallFunction:
|
||||||
|
type: object
|
||||||
|
properties:
|
||||||
|
name:
|
||||||
|
type: string
|
||||||
|
arguments:
|
||||||
|
type: string
|
||||||
|
additionalProperties: false
|
||||||
|
title: OpenAIChatCompletionToolCallFunction
|
||||||
|
OpenAIChoice:
|
||||||
|
type: object
|
||||||
|
properties:
|
||||||
|
message:
|
||||||
|
$ref: '#/components/schemas/OpenAIMessageParam'
|
||||||
|
description: The message from the model
|
||||||
|
finish_reason:
|
||||||
|
type: string
|
||||||
|
description: The reason the model stopped generating
|
||||||
|
index:
|
||||||
|
type: integer
|
||||||
|
description: The index of the choice
|
||||||
|
logprobs:
|
||||||
|
$ref: '#/components/schemas/OpenAIChoiceLogprobs'
|
||||||
|
description: >-
|
||||||
|
(Optional) The log probabilities for the tokens in the message
|
||||||
|
additionalProperties: false
|
||||||
|
required:
|
||||||
|
- message
|
||||||
|
- finish_reason
|
||||||
|
- index
|
||||||
|
title: OpenAIChoice
|
||||||
|
description: >-
|
||||||
|
A choice from an OpenAI-compatible chat completion response.
|
||||||
|
OpenAIChoiceLogprobs:
|
||||||
|
type: object
|
||||||
|
properties:
|
||||||
|
content:
|
||||||
|
type: array
|
||||||
|
items:
|
||||||
|
$ref: '#/components/schemas/OpenAITokenLogProb'
|
||||||
|
description: >-
|
||||||
|
(Optional) The log probabilities for the tokens in the message
|
||||||
|
refusal:
|
||||||
|
type: array
|
||||||
|
items:
|
||||||
|
$ref: '#/components/schemas/OpenAITokenLogProb'
|
||||||
|
description: >-
|
||||||
|
(Optional) The log probabilities for the tokens in the message
|
||||||
|
additionalProperties: false
|
||||||
|
title: OpenAIChoiceLogprobs
|
||||||
|
description: >-
|
||||||
|
The log probabilities for the tokens in the message from an OpenAI-compatible
|
||||||
|
chat completion response.
|
||||||
|
OpenAIDeveloperMessageParam:
|
||||||
|
type: object
|
||||||
|
properties:
|
||||||
|
role:
|
||||||
|
type: string
|
||||||
|
const: developer
|
||||||
|
default: developer
|
||||||
|
description: >-
|
||||||
|
Must be "developer" to identify this as a developer message
|
||||||
|
content:
|
||||||
|
oneOf:
|
||||||
|
- type: string
|
||||||
|
- type: array
|
||||||
|
items:
|
||||||
|
$ref: '#/components/schemas/OpenAIChatCompletionContentPartParam'
|
||||||
|
description: The content of the developer message
|
||||||
|
name:
|
||||||
|
type: string
|
||||||
|
description: >-
|
||||||
|
(Optional) The name of the developer message participant.
|
||||||
|
additionalProperties: false
|
||||||
|
required:
|
||||||
|
- role
|
||||||
|
- content
|
||||||
|
title: OpenAIDeveloperMessageParam
|
||||||
|
description: >-
|
||||||
|
A message from the developer in an OpenAI-compatible chat completion request.
|
||||||
|
OpenAIImageURL:
|
||||||
|
type: object
|
||||||
|
properties:
|
||||||
|
url:
|
||||||
|
type: string
|
||||||
|
detail:
|
||||||
|
type: string
|
||||||
|
additionalProperties: false
|
||||||
|
required:
|
||||||
|
- url
|
||||||
|
title: OpenAIImageURL
|
||||||
|
OpenAIMessageParam:
|
||||||
|
oneOf:
|
||||||
|
- $ref: '#/components/schemas/OpenAIUserMessageParam'
|
||||||
|
- $ref: '#/components/schemas/OpenAISystemMessageParam'
|
||||||
|
- $ref: '#/components/schemas/OpenAIAssistantMessageParam'
|
||||||
|
- $ref: '#/components/schemas/OpenAIToolMessageParam'
|
||||||
|
- $ref: '#/components/schemas/OpenAIDeveloperMessageParam'
|
||||||
|
discriminator:
|
||||||
|
propertyName: role
|
||||||
|
mapping:
|
||||||
|
user: '#/components/schemas/OpenAIUserMessageParam'
|
||||||
|
system: '#/components/schemas/OpenAISystemMessageParam'
|
||||||
|
assistant: '#/components/schemas/OpenAIAssistantMessageParam'
|
||||||
|
tool: '#/components/schemas/OpenAIToolMessageParam'
|
||||||
|
developer: '#/components/schemas/OpenAIDeveloperMessageParam'
|
||||||
|
OpenAISystemMessageParam:
|
||||||
|
type: object
|
||||||
|
properties:
|
||||||
|
role:
|
||||||
|
type: string
|
||||||
|
const: system
|
||||||
|
default: system
|
||||||
|
description: >-
|
||||||
|
Must be "system" to identify this as a system message
|
||||||
|
content:
|
||||||
|
oneOf:
|
||||||
|
- type: string
|
||||||
|
- type: array
|
||||||
|
items:
|
||||||
|
$ref: '#/components/schemas/OpenAIChatCompletionContentPartParam'
|
||||||
|
description: >-
|
||||||
|
The content of the "system prompt". If multiple system messages are provided,
|
||||||
|
they are concatenated. The underlying Llama Stack code may also add other
|
||||||
|
system messages (for example, for formatting tool definitions).
|
||||||
|
name:
|
||||||
|
type: string
|
||||||
|
description: >-
|
||||||
|
(Optional) The name of the system message participant.
|
||||||
|
additionalProperties: false
|
||||||
|
required:
|
||||||
|
- role
|
||||||
|
- content
|
||||||
|
title: OpenAISystemMessageParam
|
||||||
|
description: >-
|
||||||
|
A system message providing instructions or context to the model.
|
||||||
|
OpenAITokenLogProb:
|
||||||
|
type: object
|
||||||
|
properties:
|
||||||
|
token:
|
||||||
|
type: string
|
||||||
|
bytes:
|
||||||
|
type: array
|
||||||
|
items:
|
||||||
|
type: integer
|
||||||
|
logprob:
|
||||||
|
type: number
|
||||||
|
top_logprobs:
|
||||||
|
type: array
|
||||||
|
items:
|
||||||
|
$ref: '#/components/schemas/OpenAITopLogProb'
|
||||||
|
additionalProperties: false
|
||||||
|
required:
|
||||||
|
- token
|
||||||
|
- logprob
|
||||||
|
- top_logprobs
|
||||||
|
title: OpenAITokenLogProb
|
||||||
|
description: >-
|
||||||
|
The log probability for a token from an OpenAI-compatible chat completion
|
||||||
|
response.
|
||||||
|
OpenAIToolMessageParam:
|
||||||
|
type: object
|
||||||
|
properties:
|
||||||
|
role:
|
||||||
|
type: string
|
||||||
|
const: tool
|
||||||
|
default: tool
|
||||||
|
description: >-
|
||||||
|
Must be "tool" to identify this as a tool response
|
||||||
|
tool_call_id:
|
||||||
|
type: string
|
||||||
|
description: >-
|
||||||
|
Unique identifier for the tool call this response is for
|
||||||
|
content:
|
||||||
|
oneOf:
|
||||||
|
- type: string
|
||||||
|
- type: array
|
||||||
|
items:
|
||||||
|
$ref: '#/components/schemas/OpenAIChatCompletionContentPartParam'
|
||||||
|
description: The response content from the tool
|
||||||
|
additionalProperties: false
|
||||||
|
required:
|
||||||
|
- role
|
||||||
|
- tool_call_id
|
||||||
|
- content
|
||||||
|
title: OpenAIToolMessageParam
|
||||||
|
description: >-
|
||||||
|
A message representing the result of a tool invocation in an OpenAI-compatible
|
||||||
|
chat completion request.
|
||||||
|
OpenAITopLogProb:
|
||||||
|
type: object
|
||||||
|
properties:
|
||||||
|
token:
|
||||||
|
type: string
|
||||||
|
bytes:
|
||||||
|
type: array
|
||||||
|
items:
|
||||||
|
type: integer
|
||||||
|
logprob:
|
||||||
|
type: number
|
||||||
|
additionalProperties: false
|
||||||
|
required:
|
||||||
|
- token
|
||||||
|
- logprob
|
||||||
|
title: OpenAITopLogProb
|
||||||
|
description: >-
|
||||||
|
The top log probability for a token from an OpenAI-compatible chat completion
|
||||||
|
response.
|
||||||
|
OpenAIUserMessageParam:
|
||||||
|
type: object
|
||||||
|
properties:
|
||||||
|
role:
|
||||||
|
type: string
|
||||||
|
const: user
|
||||||
|
default: user
|
||||||
|
description: >-
|
||||||
|
Must be "user" to identify this as a user message
|
||||||
|
content:
|
||||||
|
oneOf:
|
||||||
|
- type: string
|
||||||
|
- type: array
|
||||||
|
items:
|
||||||
|
$ref: '#/components/schemas/OpenAIChatCompletionContentPartParam'
|
||||||
|
description: >-
|
||||||
|
The content of the message, which can include text and other media
|
||||||
|
name:
|
||||||
|
type: string
|
||||||
|
description: >-
|
||||||
|
(Optional) The name of the user message participant.
|
||||||
|
additionalProperties: false
|
||||||
|
required:
|
||||||
|
- role
|
||||||
|
- content
|
||||||
|
title: OpenAIUserMessageParam
|
||||||
|
description: >-
|
||||||
|
A message from the user in an OpenAI-compatible chat completion request.
|
||||||
|
OpenAICompletionWithInputMessages:
|
||||||
|
type: object
|
||||||
|
properties:
|
||||||
|
id:
|
||||||
|
type: string
|
||||||
|
description: The ID of the chat completion
|
||||||
|
choices:
|
||||||
|
type: array
|
||||||
|
items:
|
||||||
|
$ref: '#/components/schemas/OpenAIChoice'
|
||||||
|
description: List of choices
|
||||||
|
object:
|
||||||
|
type: string
|
||||||
|
const: chat.completion
|
||||||
|
default: chat.completion
|
||||||
|
description: >-
|
||||||
|
The object type, which will be "chat.completion"
|
||||||
|
created:
|
||||||
|
type: integer
|
||||||
|
description: >-
|
||||||
|
The Unix timestamp in seconds when the chat completion was created
|
||||||
|
model:
|
||||||
|
type: string
|
||||||
|
description: >-
|
||||||
|
The model that was used to generate the chat completion
|
||||||
|
input_messages:
|
||||||
|
type: array
|
||||||
|
items:
|
||||||
|
$ref: '#/components/schemas/OpenAIMessageParam'
|
||||||
|
additionalProperties: false
|
||||||
|
required:
|
||||||
|
- id
|
||||||
|
- choices
|
||||||
|
- object
|
||||||
|
- created
|
||||||
|
- model
|
||||||
|
- input_messages
|
||||||
|
title: OpenAICompletionWithInputMessages
|
||||||
DataSource:
|
DataSource:
|
||||||
oneOf:
|
oneOf:
|
||||||
- $ref: '#/components/schemas/URIDataSource'
|
- $ref: '#/components/schemas/URIDataSource'
|
||||||
|
@ -6497,6 +6939,73 @@ components:
|
||||||
required:
|
required:
|
||||||
- data
|
- data
|
||||||
title: ListBenchmarksResponse
|
title: ListBenchmarksResponse
|
||||||
|
Order:
|
||||||
|
type: string
|
||||||
|
enum:
|
||||||
|
- asc
|
||||||
|
- desc
|
||||||
|
title: Order
|
||||||
|
ListOpenAIChatCompletionResponse:
|
||||||
|
type: object
|
||||||
|
properties:
|
||||||
|
data:
|
||||||
|
type: array
|
||||||
|
items:
|
||||||
|
type: object
|
||||||
|
properties:
|
||||||
|
id:
|
||||||
|
type: string
|
||||||
|
description: The ID of the chat completion
|
||||||
|
choices:
|
||||||
|
type: array
|
||||||
|
items:
|
||||||
|
$ref: '#/components/schemas/OpenAIChoice'
|
||||||
|
description: List of choices
|
||||||
|
object:
|
||||||
|
type: string
|
||||||
|
const: chat.completion
|
||||||
|
default: chat.completion
|
||||||
|
description: >-
|
||||||
|
The object type, which will be "chat.completion"
|
||||||
|
created:
|
||||||
|
type: integer
|
||||||
|
description: >-
|
||||||
|
The Unix timestamp in seconds when the chat completion was created
|
||||||
|
model:
|
||||||
|
type: string
|
||||||
|
description: >-
|
||||||
|
The model that was used to generate the chat completion
|
||||||
|
input_messages:
|
||||||
|
type: array
|
||||||
|
items:
|
||||||
|
$ref: '#/components/schemas/OpenAIMessageParam'
|
||||||
|
additionalProperties: false
|
||||||
|
required:
|
||||||
|
- id
|
||||||
|
- choices
|
||||||
|
- object
|
||||||
|
- created
|
||||||
|
- model
|
||||||
|
- input_messages
|
||||||
|
title: OpenAICompletionWithInputMessages
|
||||||
|
has_more:
|
||||||
|
type: boolean
|
||||||
|
first_id:
|
||||||
|
type: string
|
||||||
|
last_id:
|
||||||
|
type: string
|
||||||
|
object:
|
||||||
|
type: string
|
||||||
|
const: list
|
||||||
|
default: list
|
||||||
|
additionalProperties: false
|
||||||
|
required:
|
||||||
|
- data
|
||||||
|
- has_more
|
||||||
|
- first_id
|
||||||
|
- last_id
|
||||||
|
- object
|
||||||
|
title: ListOpenAIChatCompletionResponse
|
||||||
ListDatasetsResponse:
|
ListDatasetsResponse:
|
||||||
type: object
|
type: object
|
||||||
properties:
|
properties:
|
||||||
|
@ -6835,142 +7344,6 @@ components:
|
||||||
- event
|
- event
|
||||||
- ttl_seconds
|
- ttl_seconds
|
||||||
title: LogEventRequest
|
title: LogEventRequest
|
||||||
OpenAIAssistantMessageParam:
|
|
||||||
type: object
|
|
||||||
properties:
|
|
||||||
role:
|
|
||||||
type: string
|
|
||||||
const: assistant
|
|
||||||
default: assistant
|
|
||||||
description: >-
|
|
||||||
Must be "assistant" to identify this as the model's response
|
|
||||||
content:
|
|
||||||
oneOf:
|
|
||||||
- type: string
|
|
||||||
- type: array
|
|
||||||
items:
|
|
||||||
$ref: '#/components/schemas/OpenAIChatCompletionContentPartParam'
|
|
||||||
description: The content of the model's response
|
|
||||||
name:
|
|
||||||
type: string
|
|
||||||
description: >-
|
|
||||||
(Optional) The name of the assistant message participant.
|
|
||||||
tool_calls:
|
|
||||||
type: array
|
|
||||||
items:
|
|
||||||
$ref: '#/components/schemas/OpenAIChatCompletionToolCall'
|
|
||||||
description: >-
|
|
||||||
List of tool calls. Each tool call is an OpenAIChatCompletionToolCall
|
|
||||||
object.
|
|
||||||
additionalProperties: false
|
|
||||||
required:
|
|
||||||
- role
|
|
||||||
title: OpenAIAssistantMessageParam
|
|
||||||
description: >-
|
|
||||||
A message containing the model's (assistant) response in an OpenAI-compatible
|
|
||||||
chat completion request.
|
|
||||||
"OpenAIChatCompletionContentPartImageParam":
|
|
||||||
type: object
|
|
||||||
properties:
|
|
||||||
type:
|
|
||||||
type: string
|
|
||||||
const: image_url
|
|
||||||
default: image_url
|
|
||||||
image_url:
|
|
||||||
$ref: '#/components/schemas/OpenAIImageURL'
|
|
||||||
additionalProperties: false
|
|
||||||
required:
|
|
||||||
- type
|
|
||||||
- image_url
|
|
||||||
title: >-
|
|
||||||
OpenAIChatCompletionContentPartImageParam
|
|
||||||
OpenAIChatCompletionContentPartParam:
|
|
||||||
oneOf:
|
|
||||||
- $ref: '#/components/schemas/OpenAIChatCompletionContentPartTextParam'
|
|
||||||
- $ref: '#/components/schemas/OpenAIChatCompletionContentPartImageParam'
|
|
||||||
discriminator:
|
|
||||||
propertyName: type
|
|
||||||
mapping:
|
|
||||||
text: '#/components/schemas/OpenAIChatCompletionContentPartTextParam'
|
|
||||||
image_url: '#/components/schemas/OpenAIChatCompletionContentPartImageParam'
|
|
||||||
OpenAIChatCompletionContentPartTextParam:
|
|
||||||
type: object
|
|
||||||
properties:
|
|
||||||
type:
|
|
||||||
type: string
|
|
||||||
const: text
|
|
||||||
default: text
|
|
||||||
text:
|
|
||||||
type: string
|
|
||||||
additionalProperties: false
|
|
||||||
required:
|
|
||||||
- type
|
|
||||||
- text
|
|
||||||
title: OpenAIChatCompletionContentPartTextParam
|
|
||||||
OpenAIChatCompletionToolCall:
|
|
||||||
type: object
|
|
||||||
properties:
|
|
||||||
index:
|
|
||||||
type: integer
|
|
||||||
id:
|
|
||||||
type: string
|
|
||||||
type:
|
|
||||||
type: string
|
|
||||||
const: function
|
|
||||||
default: function
|
|
||||||
function:
|
|
||||||
$ref: '#/components/schemas/OpenAIChatCompletionToolCallFunction'
|
|
||||||
additionalProperties: false
|
|
||||||
required:
|
|
||||||
- type
|
|
||||||
title: OpenAIChatCompletionToolCall
|
|
||||||
OpenAIChatCompletionToolCallFunction:
|
|
||||||
type: object
|
|
||||||
properties:
|
|
||||||
name:
|
|
||||||
type: string
|
|
||||||
arguments:
|
|
||||||
type: string
|
|
||||||
additionalProperties: false
|
|
||||||
title: OpenAIChatCompletionToolCallFunction
|
|
||||||
OpenAIDeveloperMessageParam:
|
|
||||||
type: object
|
|
||||||
properties:
|
|
||||||
role:
|
|
||||||
type: string
|
|
||||||
const: developer
|
|
||||||
default: developer
|
|
||||||
description: >-
|
|
||||||
Must be "developer" to identify this as a developer message
|
|
||||||
content:
|
|
||||||
oneOf:
|
|
||||||
- type: string
|
|
||||||
- type: array
|
|
||||||
items:
|
|
||||||
$ref: '#/components/schemas/OpenAIChatCompletionContentPartParam'
|
|
||||||
description: The content of the developer message
|
|
||||||
name:
|
|
||||||
type: string
|
|
||||||
description: >-
|
|
||||||
(Optional) The name of the developer message participant.
|
|
||||||
additionalProperties: false
|
|
||||||
required:
|
|
||||||
- role
|
|
||||||
- content
|
|
||||||
title: OpenAIDeveloperMessageParam
|
|
||||||
description: >-
|
|
||||||
A message from the developer in an OpenAI-compatible chat completion request.
|
|
||||||
OpenAIImageURL:
|
|
||||||
type: object
|
|
||||||
properties:
|
|
||||||
url:
|
|
||||||
type: string
|
|
||||||
detail:
|
|
||||||
type: string
|
|
||||||
additionalProperties: false
|
|
||||||
required:
|
|
||||||
- url
|
|
||||||
title: OpenAIImageURL
|
|
||||||
OpenAIJSONSchema:
|
OpenAIJSONSchema:
|
||||||
type: object
|
type: object
|
||||||
properties:
|
properties:
|
||||||
|
@ -6994,21 +7367,6 @@ components:
|
||||||
required:
|
required:
|
||||||
- name
|
- name
|
||||||
title: OpenAIJSONSchema
|
title: OpenAIJSONSchema
|
||||||
OpenAIMessageParam:
|
|
||||||
oneOf:
|
|
||||||
- $ref: '#/components/schemas/OpenAIUserMessageParam'
|
|
||||||
- $ref: '#/components/schemas/OpenAISystemMessageParam'
|
|
||||||
- $ref: '#/components/schemas/OpenAIAssistantMessageParam'
|
|
||||||
- $ref: '#/components/schemas/OpenAIToolMessageParam'
|
|
||||||
- $ref: '#/components/schemas/OpenAIDeveloperMessageParam'
|
|
||||||
discriminator:
|
|
||||||
propertyName: role
|
|
||||||
mapping:
|
|
||||||
user: '#/components/schemas/OpenAIUserMessageParam'
|
|
||||||
system: '#/components/schemas/OpenAISystemMessageParam'
|
|
||||||
assistant: '#/components/schemas/OpenAIAssistantMessageParam'
|
|
||||||
tool: '#/components/schemas/OpenAIToolMessageParam'
|
|
||||||
developer: '#/components/schemas/OpenAIDeveloperMessageParam'
|
|
||||||
OpenAIResponseFormatJSONObject:
|
OpenAIResponseFormatJSONObject:
|
||||||
type: object
|
type: object
|
||||||
properties:
|
properties:
|
||||||
|
@ -7056,93 +7414,6 @@ components:
|
||||||
required:
|
required:
|
||||||
- type
|
- type
|
||||||
title: OpenAIResponseFormatText
|
title: OpenAIResponseFormatText
|
||||||
OpenAISystemMessageParam:
|
|
||||||
type: object
|
|
||||||
properties:
|
|
||||||
role:
|
|
||||||
type: string
|
|
||||||
const: system
|
|
||||||
default: system
|
|
||||||
description: >-
|
|
||||||
Must be "system" to identify this as a system message
|
|
||||||
content:
|
|
||||||
oneOf:
|
|
||||||
- type: string
|
|
||||||
- type: array
|
|
||||||
items:
|
|
||||||
$ref: '#/components/schemas/OpenAIChatCompletionContentPartParam'
|
|
||||||
description: >-
|
|
||||||
The content of the "system prompt". If multiple system messages are provided,
|
|
||||||
they are concatenated. The underlying Llama Stack code may also add other
|
|
||||||
system messages (for example, for formatting tool definitions).
|
|
||||||
name:
|
|
||||||
type: string
|
|
||||||
description: >-
|
|
||||||
(Optional) The name of the system message participant.
|
|
||||||
additionalProperties: false
|
|
||||||
required:
|
|
||||||
- role
|
|
||||||
- content
|
|
||||||
title: OpenAISystemMessageParam
|
|
||||||
description: >-
|
|
||||||
A system message providing instructions or context to the model.
|
|
||||||
OpenAIToolMessageParam:
|
|
||||||
type: object
|
|
||||||
properties:
|
|
||||||
role:
|
|
||||||
type: string
|
|
||||||
const: tool
|
|
||||||
default: tool
|
|
||||||
description: >-
|
|
||||||
Must be "tool" to identify this as a tool response
|
|
||||||
tool_call_id:
|
|
||||||
type: string
|
|
||||||
description: >-
|
|
||||||
Unique identifier for the tool call this response is for
|
|
||||||
content:
|
|
||||||
oneOf:
|
|
||||||
- type: string
|
|
||||||
- type: array
|
|
||||||
items:
|
|
||||||
$ref: '#/components/schemas/OpenAIChatCompletionContentPartParam'
|
|
||||||
description: The response content from the tool
|
|
||||||
additionalProperties: false
|
|
||||||
required:
|
|
||||||
- role
|
|
||||||
- tool_call_id
|
|
||||||
- content
|
|
||||||
title: OpenAIToolMessageParam
|
|
||||||
description: >-
|
|
||||||
A message representing the result of a tool invocation in an OpenAI-compatible
|
|
||||||
chat completion request.
|
|
||||||
OpenAIUserMessageParam:
|
|
||||||
type: object
|
|
||||||
properties:
|
|
||||||
role:
|
|
||||||
type: string
|
|
||||||
const: user
|
|
||||||
default: user
|
|
||||||
description: >-
|
|
||||||
Must be "user" to identify this as a user message
|
|
||||||
content:
|
|
||||||
oneOf:
|
|
||||||
- type: string
|
|
||||||
- type: array
|
|
||||||
items:
|
|
||||||
$ref: '#/components/schemas/OpenAIChatCompletionContentPartParam'
|
|
||||||
description: >-
|
|
||||||
The content of the message, which can include text and other media
|
|
||||||
name:
|
|
||||||
type: string
|
|
||||||
description: >-
|
|
||||||
(Optional) The name of the user message participant.
|
|
||||||
additionalProperties: false
|
|
||||||
required:
|
|
||||||
- role
|
|
||||||
- content
|
|
||||||
title: OpenAIUserMessageParam
|
|
||||||
description: >-
|
|
||||||
A message from the user in an OpenAI-compatible chat completion request.
|
|
||||||
OpenaiChatCompletionRequest:
|
OpenaiChatCompletionRequest:
|
||||||
type: object
|
type: object
|
||||||
properties:
|
properties:
|
||||||
|
@ -7356,30 +7627,6 @@ components:
|
||||||
title: OpenAIChatCompletionChunk
|
title: OpenAIChatCompletionChunk
|
||||||
description: >-
|
description: >-
|
||||||
Chunk from a streaming response to an OpenAI-compatible chat completion request.
|
Chunk from a streaming response to an OpenAI-compatible chat completion request.
|
||||||
OpenAIChoice:
|
|
||||||
type: object
|
|
||||||
properties:
|
|
||||||
message:
|
|
||||||
$ref: '#/components/schemas/OpenAIMessageParam'
|
|
||||||
description: The message from the model
|
|
||||||
finish_reason:
|
|
||||||
type: string
|
|
||||||
description: The reason the model stopped generating
|
|
||||||
index:
|
|
||||||
type: integer
|
|
||||||
description: The index of the choice
|
|
||||||
logprobs:
|
|
||||||
$ref: '#/components/schemas/OpenAIChoiceLogprobs'
|
|
||||||
description: >-
|
|
||||||
(Optional) The log probabilities for the tokens in the message
|
|
||||||
additionalProperties: false
|
|
||||||
required:
|
|
||||||
- message
|
|
||||||
- finish_reason
|
|
||||||
- index
|
|
||||||
title: OpenAIChoice
|
|
||||||
description: >-
|
|
||||||
A choice from an OpenAI-compatible chat completion response.
|
|
||||||
OpenAIChoiceDelta:
|
OpenAIChoiceDelta:
|
||||||
type: object
|
type: object
|
||||||
properties:
|
properties:
|
||||||
|
@ -7401,26 +7648,6 @@ components:
|
||||||
title: OpenAIChoiceDelta
|
title: OpenAIChoiceDelta
|
||||||
description: >-
|
description: >-
|
||||||
A delta from an OpenAI-compatible chat completion streaming response.
|
A delta from an OpenAI-compatible chat completion streaming response.
|
||||||
OpenAIChoiceLogprobs:
|
|
||||||
type: object
|
|
||||||
properties:
|
|
||||||
content:
|
|
||||||
type: array
|
|
||||||
items:
|
|
||||||
$ref: '#/components/schemas/OpenAITokenLogProb'
|
|
||||||
description: >-
|
|
||||||
(Optional) The log probabilities for the tokens in the message
|
|
||||||
refusal:
|
|
||||||
type: array
|
|
||||||
items:
|
|
||||||
$ref: '#/components/schemas/OpenAITokenLogProb'
|
|
||||||
description: >-
|
|
||||||
(Optional) The log probabilities for the tokens in the message
|
|
||||||
additionalProperties: false
|
|
||||||
title: OpenAIChoiceLogprobs
|
|
||||||
description: >-
|
|
||||||
The log probabilities for the tokens in the message from an OpenAI-compatible
|
|
||||||
chat completion response.
|
|
||||||
OpenAIChunkChoice:
|
OpenAIChunkChoice:
|
||||||
type: object
|
type: object
|
||||||
properties:
|
properties:
|
||||||
|
@ -7445,49 +7672,6 @@ components:
|
||||||
title: OpenAIChunkChoice
|
title: OpenAIChunkChoice
|
||||||
description: >-
|
description: >-
|
||||||
A chunk choice from an OpenAI-compatible chat completion streaming response.
|
A chunk choice from an OpenAI-compatible chat completion streaming response.
|
||||||
OpenAITokenLogProb:
|
|
||||||
type: object
|
|
||||||
properties:
|
|
||||||
token:
|
|
||||||
type: string
|
|
||||||
bytes:
|
|
||||||
type: array
|
|
||||||
items:
|
|
||||||
type: integer
|
|
||||||
logprob:
|
|
||||||
type: number
|
|
||||||
top_logprobs:
|
|
||||||
type: array
|
|
||||||
items:
|
|
||||||
$ref: '#/components/schemas/OpenAITopLogProb'
|
|
||||||
additionalProperties: false
|
|
||||||
required:
|
|
||||||
- token
|
|
||||||
- logprob
|
|
||||||
- top_logprobs
|
|
||||||
title: OpenAITokenLogProb
|
|
||||||
description: >-
|
|
||||||
The log probability for a token from an OpenAI-compatible chat completion
|
|
||||||
response.
|
|
||||||
OpenAITopLogProb:
|
|
||||||
type: object
|
|
||||||
properties:
|
|
||||||
token:
|
|
||||||
type: string
|
|
||||||
bytes:
|
|
||||||
type: array
|
|
||||||
items:
|
|
||||||
type: integer
|
|
||||||
logprob:
|
|
||||||
type: number
|
|
||||||
additionalProperties: false
|
|
||||||
required:
|
|
||||||
- token
|
|
||||||
- logprob
|
|
||||||
title: OpenAITopLogProb
|
|
||||||
description: >-
|
|
||||||
The top log probability for a token from an OpenAI-compatible chat completion
|
|
||||||
response.
|
|
||||||
OpenaiCompletionRequest:
|
OpenaiCompletionRequest:
|
||||||
type: object
|
type: object
|
||||||
properties:
|
properties:
|
||||||
|
|
|
@ -759,7 +759,7 @@ class Generator:
|
||||||
)
|
)
|
||||||
|
|
||||||
return Operation(
|
return Operation(
|
||||||
tags=[op.defining_class.__name__],
|
tags=[getattr(op.defining_class, "API_NAMESPACE", op.defining_class.__name__)],
|
||||||
summary=None,
|
summary=None,
|
||||||
# summary=doc_string.short_description,
|
# summary=doc_string.short_description,
|
||||||
description=description,
|
description=description,
|
||||||
|
@ -805,6 +805,8 @@ class Generator:
|
||||||
operation_tags: List[Tag] = []
|
operation_tags: List[Tag] = []
|
||||||
for cls in endpoint_classes:
|
for cls in endpoint_classes:
|
||||||
doc_string = parse_type(cls)
|
doc_string = parse_type(cls)
|
||||||
|
if hasattr(cls, "API_NAMESPACE") and cls.API_NAMESPACE != cls.__name__:
|
||||||
|
continue
|
||||||
operation_tags.append(
|
operation_tags.append(
|
||||||
Tag(
|
Tag(
|
||||||
name=cls.__name__,
|
name=cls.__name__,
|
||||||
|
|
|
@ -820,15 +820,32 @@ class BatchChatCompletionResponse(BaseModel):
|
||||||
batch: list[ChatCompletionResponse]
|
batch: list[ChatCompletionResponse]
|
||||||
|
|
||||||
|
|
||||||
|
class OpenAICompletionWithInputMessages(OpenAIChatCompletion):
|
||||||
|
input_messages: list[OpenAIMessageParam]
|
||||||
|
|
||||||
|
|
||||||
|
@json_schema_type
|
||||||
|
class ListOpenAIChatCompletionResponse(BaseModel):
|
||||||
|
data: list[OpenAICompletionWithInputMessages]
|
||||||
|
has_more: bool
|
||||||
|
first_id: str
|
||||||
|
last_id: str
|
||||||
|
object: Literal["list"] = "list"
|
||||||
|
|
||||||
|
|
||||||
|
class Order(Enum):
|
||||||
|
asc = "asc"
|
||||||
|
desc = "desc"
|
||||||
|
|
||||||
|
|
||||||
@runtime_checkable
|
@runtime_checkable
|
||||||
@trace_protocol
|
@trace_protocol
|
||||||
class Inference(Protocol):
|
class InferenceProvider(Protocol):
|
||||||
"""Llama Stack Inference API for generating completions, chat completions, and embeddings.
|
|
||||||
|
|
||||||
This API provides the raw interface to the underlying models. Two kinds of models are supported:
|
|
||||||
- LLM models: these models generate "raw" and "chat" (conversational) completions.
|
|
||||||
- Embedding models: these models generate embeddings to be used for semantic search.
|
|
||||||
"""
|
"""
|
||||||
|
This protocol defines the interface that should be implemented by all inference providers.
|
||||||
|
"""
|
||||||
|
|
||||||
|
API_NAMESPACE: str = "Inference"
|
||||||
|
|
||||||
model_store: ModelStore | None = None
|
model_store: ModelStore | None = None
|
||||||
|
|
||||||
|
@ -1062,3 +1079,39 @@ class Inference(Protocol):
|
||||||
:returns: An OpenAIChatCompletion.
|
:returns: An OpenAIChatCompletion.
|
||||||
"""
|
"""
|
||||||
...
|
...
|
||||||
|
|
||||||
|
|
||||||
|
class Inference(InferenceProvider):
|
||||||
|
"""Llama Stack Inference API for generating completions, chat completions, and embeddings.
|
||||||
|
|
||||||
|
This API provides the raw interface to the underlying models. Two kinds of models are supported:
|
||||||
|
- LLM models: these models generate "raw" and "chat" (conversational) completions.
|
||||||
|
- Embedding models: these models generate embeddings to be used for semantic search.
|
||||||
|
"""
|
||||||
|
|
||||||
|
@webmethod(route="/openai/v1/chat/completions", method="GET")
|
||||||
|
async def list_chat_completions(
|
||||||
|
self,
|
||||||
|
after: str | None = None,
|
||||||
|
limit: int | None = 20,
|
||||||
|
model: str | None = None,
|
||||||
|
order: Order | None = Order.desc,
|
||||||
|
) -> ListOpenAIChatCompletionResponse:
|
||||||
|
"""List all chat completions.
|
||||||
|
|
||||||
|
:param after: The ID of the last chat completion to return.
|
||||||
|
:param limit: The maximum number of chat completions to return.
|
||||||
|
:param model: The model to filter by.
|
||||||
|
:param order: The order to sort the chat completions by: "asc" or "desc". Defaults to "desc".
|
||||||
|
:returns: A ListOpenAIChatCompletionResponse.
|
||||||
|
"""
|
||||||
|
raise NotImplementedError("List chat completions is not implemented")
|
||||||
|
|
||||||
|
@webmethod(route="/openai/v1/chat/completions/{completion_id}", method="GET")
|
||||||
|
async def get_chat_completion(self, completion_id: str) -> OpenAICompletionWithInputMessages:
|
||||||
|
"""Describe a chat completion by its ID.
|
||||||
|
|
||||||
|
:param completion_id: ID of the chat completion.
|
||||||
|
:returns: A OpenAICompletionWithInputMessages.
|
||||||
|
"""
|
||||||
|
raise NotImplementedError("Get chat completion is not implemented")
|
||||||
|
|
|
@ -13,7 +13,7 @@ from llama_stack.apis.datasetio import DatasetIO
|
||||||
from llama_stack.apis.datasets import Datasets
|
from llama_stack.apis.datasets import Datasets
|
||||||
from llama_stack.apis.eval import Eval
|
from llama_stack.apis.eval import Eval
|
||||||
from llama_stack.apis.files import Files
|
from llama_stack.apis.files import Files
|
||||||
from llama_stack.apis.inference import Inference
|
from llama_stack.apis.inference import Inference, InferenceProvider
|
||||||
from llama_stack.apis.inspect import Inspect
|
from llama_stack.apis.inspect import Inspect
|
||||||
from llama_stack.apis.models import Models
|
from llama_stack.apis.models import Models
|
||||||
from llama_stack.apis.post_training import PostTraining
|
from llama_stack.apis.post_training import PostTraining
|
||||||
|
@ -83,6 +83,13 @@ def api_protocol_map() -> dict[Api, Any]:
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def api_protocol_map_for_compliance_check() -> dict[Api, Any]:
|
||||||
|
return {
|
||||||
|
**api_protocol_map(),
|
||||||
|
Api.inference: InferenceProvider,
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
def additional_protocols_map() -> dict[Api, Any]:
|
def additional_protocols_map() -> dict[Api, Any]:
|
||||||
return {
|
return {
|
||||||
Api.inference: (ModelsProtocolPrivate, Models, Api.models),
|
Api.inference: (ModelsProtocolPrivate, Models, Api.models),
|
||||||
|
@ -302,9 +309,6 @@ async def instantiate_provider(
|
||||||
inner_impls: dict[str, Any],
|
inner_impls: dict[str, Any],
|
||||||
dist_registry: DistributionRegistry,
|
dist_registry: DistributionRegistry,
|
||||||
):
|
):
|
||||||
protocols = api_protocol_map()
|
|
||||||
additional_protocols = additional_protocols_map()
|
|
||||||
|
|
||||||
provider_spec = provider.spec
|
provider_spec = provider.spec
|
||||||
if not hasattr(provider_spec, "module"):
|
if not hasattr(provider_spec, "module"):
|
||||||
raise AttributeError(f"ProviderSpec of type {type(provider_spec)} does not have a 'module' attribute")
|
raise AttributeError(f"ProviderSpec of type {type(provider_spec)} does not have a 'module' attribute")
|
||||||
|
@ -342,6 +346,8 @@ async def instantiate_provider(
|
||||||
impl.__provider_spec__ = provider_spec
|
impl.__provider_spec__ = provider_spec
|
||||||
impl.__provider_config__ = config
|
impl.__provider_config__ = config
|
||||||
|
|
||||||
|
protocols = api_protocol_map_for_compliance_check()
|
||||||
|
additional_protocols = additional_protocols_map()
|
||||||
# TODO: check compliance for special tool groups
|
# TODO: check compliance for special tool groups
|
||||||
# the impl should be for Api.tool_runtime, the name should be the special tool group, the protocol should be the special tool group protocol
|
# the impl should be for Api.tool_runtime, the name should be the special tool group, the protocol should be the special tool group protocol
|
||||||
check_protocol_compliance(impl, protocols[provider_spec.api])
|
check_protocol_compliance(impl, protocols[provider_spec.api])
|
||||||
|
|
|
@ -28,7 +28,7 @@ from llama_stack.apis.inference import (
|
||||||
CompletionRequest,
|
CompletionRequest,
|
||||||
CompletionResponse,
|
CompletionResponse,
|
||||||
CompletionResponseStreamChunk,
|
CompletionResponseStreamChunk,
|
||||||
Inference,
|
InferenceProvider,
|
||||||
InterleavedContent,
|
InterleavedContent,
|
||||||
LogProbConfig,
|
LogProbConfig,
|
||||||
Message,
|
Message,
|
||||||
|
@ -86,7 +86,7 @@ class MetaReferenceInferenceImpl(
|
||||||
OpenAICompletionToLlamaStackMixin,
|
OpenAICompletionToLlamaStackMixin,
|
||||||
OpenAIChatCompletionToLlamaStackMixin,
|
OpenAIChatCompletionToLlamaStackMixin,
|
||||||
SentenceTransformerEmbeddingMixin,
|
SentenceTransformerEmbeddingMixin,
|
||||||
Inference,
|
InferenceProvider,
|
||||||
ModelsProtocolPrivate,
|
ModelsProtocolPrivate,
|
||||||
):
|
):
|
||||||
def __init__(self, config: MetaReferenceInferenceConfig) -> None:
|
def __init__(self, config: MetaReferenceInferenceConfig) -> None:
|
||||||
|
|
|
@ -9,7 +9,7 @@ from collections.abc import AsyncGenerator
|
||||||
|
|
||||||
from llama_stack.apis.inference import (
|
from llama_stack.apis.inference import (
|
||||||
CompletionResponse,
|
CompletionResponse,
|
||||||
Inference,
|
InferenceProvider,
|
||||||
InterleavedContent,
|
InterleavedContent,
|
||||||
LogProbConfig,
|
LogProbConfig,
|
||||||
Message,
|
Message,
|
||||||
|
@ -38,7 +38,7 @@ class SentenceTransformersInferenceImpl(
|
||||||
OpenAIChatCompletionToLlamaStackMixin,
|
OpenAIChatCompletionToLlamaStackMixin,
|
||||||
OpenAICompletionToLlamaStackMixin,
|
OpenAICompletionToLlamaStackMixin,
|
||||||
SentenceTransformerEmbeddingMixin,
|
SentenceTransformerEmbeddingMixin,
|
||||||
Inference,
|
InferenceProvider,
|
||||||
ModelsProtocolPrivate,
|
ModelsProtocolPrivate,
|
||||||
):
|
):
|
||||||
def __init__(self, config: SentenceTransformersInferenceConfig) -> None:
|
def __init__(self, config: SentenceTransformersInferenceConfig) -> None:
|
||||||
|
|
|
@ -4,12 +4,12 @@
|
||||||
# This source code is licensed under the terms described in the LICENSE file in
|
# This source code is licensed under the terms described in the LICENSE file in
|
||||||
# the root directory of this source tree.
|
# the root directory of this source tree.
|
||||||
|
|
||||||
from llama_stack.apis.inference import Inference
|
from llama_stack.apis.inference import InferenceProvider
|
||||||
|
|
||||||
from .config import CerebrasCompatConfig
|
from .config import CerebrasCompatConfig
|
||||||
|
|
||||||
|
|
||||||
async def get_adapter_impl(config: CerebrasCompatConfig, _deps) -> Inference:
|
async def get_adapter_impl(config: CerebrasCompatConfig, _deps) -> InferenceProvider:
|
||||||
# import dynamically so the import is used only when it is needed
|
# import dynamically so the import is used only when it is needed
|
||||||
from .cerebras import CerebrasCompatInferenceAdapter
|
from .cerebras import CerebrasCompatInferenceAdapter
|
||||||
|
|
||||||
|
|
|
@ -4,12 +4,12 @@
|
||||||
# This source code is licensed under the terms described in the LICENSE file in
|
# This source code is licensed under the terms described in the LICENSE file in
|
||||||
# the root directory of this source tree.
|
# the root directory of this source tree.
|
||||||
|
|
||||||
from llama_stack.apis.inference import Inference
|
from llama_stack.apis.inference import InferenceProvider
|
||||||
|
|
||||||
from .config import FireworksCompatConfig
|
from .config import FireworksCompatConfig
|
||||||
|
|
||||||
|
|
||||||
async def get_adapter_impl(config: FireworksCompatConfig, _deps) -> Inference:
|
async def get_adapter_impl(config: FireworksCompatConfig, _deps) -> InferenceProvider:
|
||||||
# import dynamically so the import is used only when it is needed
|
# import dynamically so the import is used only when it is needed
|
||||||
from .fireworks import FireworksCompatInferenceAdapter
|
from .fireworks import FireworksCompatInferenceAdapter
|
||||||
|
|
||||||
|
|
|
@ -4,12 +4,12 @@
|
||||||
# This source code is licensed under the terms described in the LICENSE file in
|
# This source code is licensed under the terms described in the LICENSE file in
|
||||||
# the root directory of this source tree.
|
# the root directory of this source tree.
|
||||||
|
|
||||||
from llama_stack.apis.inference import Inference
|
from llama_stack.apis.inference import InferenceProvider
|
||||||
|
|
||||||
from .config import GroqCompatConfig
|
from .config import GroqCompatConfig
|
||||||
|
|
||||||
|
|
||||||
async def get_adapter_impl(config: GroqCompatConfig, _deps) -> Inference:
|
async def get_adapter_impl(config: GroqCompatConfig, _deps) -> InferenceProvider:
|
||||||
# import dynamically so the import is used only when it is needed
|
# import dynamically so the import is used only when it is needed
|
||||||
from .groq import GroqCompatInferenceAdapter
|
from .groq import GroqCompatInferenceAdapter
|
||||||
|
|
||||||
|
|
|
@ -4,12 +4,12 @@
|
||||||
# This source code is licensed under the terms described in the LICENSE file in
|
# This source code is licensed under the terms described in the LICENSE file in
|
||||||
# the root directory of this source tree.
|
# the root directory of this source tree.
|
||||||
|
|
||||||
from llama_stack.apis.inference import Inference
|
from llama_stack.apis.inference import InferenceProvider
|
||||||
|
|
||||||
from .config import LlamaCompatConfig
|
from .config import LlamaCompatConfig
|
||||||
|
|
||||||
|
|
||||||
async def get_adapter_impl(config: LlamaCompatConfig, _deps) -> Inference:
|
async def get_adapter_impl(config: LlamaCompatConfig, _deps) -> InferenceProvider:
|
||||||
# import dynamically so the import is used only when it is needed
|
# import dynamically so the import is used only when it is needed
|
||||||
from .llama import LlamaCompatInferenceAdapter
|
from .llama import LlamaCompatInferenceAdapter
|
||||||
|
|
||||||
|
|
|
@ -28,7 +28,7 @@ from llama_stack.apis.inference import (
|
||||||
EmbeddingsResponse,
|
EmbeddingsResponse,
|
||||||
EmbeddingTaskType,
|
EmbeddingTaskType,
|
||||||
GrammarResponseFormat,
|
GrammarResponseFormat,
|
||||||
Inference,
|
InferenceProvider,
|
||||||
JsonSchemaResponseFormat,
|
JsonSchemaResponseFormat,
|
||||||
LogProbConfig,
|
LogProbConfig,
|
||||||
Message,
|
Message,
|
||||||
|
@ -82,7 +82,7 @@ logger = get_logger(name=__name__, category="inference")
|
||||||
|
|
||||||
|
|
||||||
class OllamaInferenceAdapter(
|
class OllamaInferenceAdapter(
|
||||||
Inference,
|
InferenceProvider,
|
||||||
ModelsProtocolPrivate,
|
ModelsProtocolPrivate,
|
||||||
):
|
):
|
||||||
def __init__(self, url: str) -> None:
|
def __init__(self, url: str) -> None:
|
||||||
|
|
|
@ -4,12 +4,12 @@
|
||||||
# This source code is licensed under the terms described in the LICENSE file in
|
# This source code is licensed under the terms described in the LICENSE file in
|
||||||
# the root directory of this source tree.
|
# the root directory of this source tree.
|
||||||
|
|
||||||
from llama_stack.apis.inference import Inference
|
from llama_stack.apis.inference import InferenceProvider
|
||||||
|
|
||||||
from .config import SambaNovaCompatConfig
|
from .config import SambaNovaCompatConfig
|
||||||
|
|
||||||
|
|
||||||
async def get_adapter_impl(config: SambaNovaCompatConfig, _deps) -> Inference:
|
async def get_adapter_impl(config: SambaNovaCompatConfig, _deps) -> InferenceProvider:
|
||||||
# import dynamically so the import is used only when it is needed
|
# import dynamically so the import is used only when it is needed
|
||||||
from .sambanova import SambaNovaCompatInferenceAdapter
|
from .sambanova import SambaNovaCompatInferenceAdapter
|
||||||
|
|
||||||
|
|
|
@ -4,12 +4,12 @@
|
||||||
# This source code is licensed under the terms described in the LICENSE file in
|
# This source code is licensed under the terms described in the LICENSE file in
|
||||||
# the root directory of this source tree.
|
# the root directory of this source tree.
|
||||||
|
|
||||||
from llama_stack.apis.inference import Inference
|
from llama_stack.apis.inference import InferenceProvider
|
||||||
|
|
||||||
from .config import TogetherCompatConfig
|
from .config import TogetherCompatConfig
|
||||||
|
|
||||||
|
|
||||||
async def get_adapter_impl(config: TogetherCompatConfig, _deps) -> Inference:
|
async def get_adapter_impl(config: TogetherCompatConfig, _deps) -> InferenceProvider:
|
||||||
# import dynamically so the import is used only when it is needed
|
# import dynamically so the import is used only when it is needed
|
||||||
from .together import TogetherCompatInferenceAdapter
|
from .together import TogetherCompatInferenceAdapter
|
||||||
|
|
||||||
|
|
|
@ -19,7 +19,7 @@ from llama_stack.apis.inference import (
|
||||||
ChatCompletionResponseStreamChunk,
|
ChatCompletionResponseStreamChunk,
|
||||||
EmbeddingsResponse,
|
EmbeddingsResponse,
|
||||||
EmbeddingTaskType,
|
EmbeddingTaskType,
|
||||||
Inference,
|
InferenceProvider,
|
||||||
JsonSchemaResponseFormat,
|
JsonSchemaResponseFormat,
|
||||||
LogProbConfig,
|
LogProbConfig,
|
||||||
Message,
|
Message,
|
||||||
|
@ -59,7 +59,7 @@ logger = get_logger(name=__name__, category="inference")
|
||||||
|
|
||||||
class LiteLLMOpenAIMixin(
|
class LiteLLMOpenAIMixin(
|
||||||
ModelRegistryHelper,
|
ModelRegistryHelper,
|
||||||
Inference,
|
InferenceProvider,
|
||||||
NeedsRequestProviderData,
|
NeedsRequestProviderData,
|
||||||
):
|
):
|
||||||
# TODO: avoid exposing the litellm specific model names to the user.
|
# TODO: avoid exposing the litellm specific model names to the user.
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue