Mirror of https://github.com/meta-llama/llama-stack.git (synced 2025-08-01 16:24:44 +00:00)

Merge branch 'main' into patch-metadata

Commit f0a142f5a8 — 21 changed files with 1405 additions and 887 deletions
docs/_static/llama-stack-spec.html (vendored, 1202 changes)
File diff suppressed because it is too large
docs/_static/llama-stack-spec.yaml (vendored, 900 changes)

@@ -827,6 +827,35 @@ paths:
          required: true
          schema:
            type: string
+  /v1/openai/v1/chat/completions/{completion_id}:
+    get:
+      responses:
+        '200':
+          description: A OpenAICompletionWithInputMessages.
+          content:
+            application/json:
+              schema:
+                $ref: '#/components/schemas/OpenAICompletionWithInputMessages'
+        '400':
+          $ref: '#/components/responses/BadRequest400'
+        '429':
+          $ref: >-
+            #/components/responses/TooManyRequests429
+        '500':
+          $ref: >-
+            #/components/responses/InternalServerError500
+        default:
+          $ref: '#/components/responses/DefaultError'
+      tags:
+        - Inference
+      description: Describe a chat completion by its ID.
+      parameters:
+        - name: completion_id
+          in: path
+          description: ID of the chat completion.
+          required: true
+          schema:
+            type: string
   /v1/datasets/{dataset_id}:
     get:
       responses:

@@ -1795,6 +1824,89 @@ paths:
             schema:
               $ref: '#/components/schemas/RegisterBenchmarkRequest'
       required: true
+  /v1/openai/v1/chat/completions:
+    get:
+      responses:
+        '200':
+          description: A ListOpenAIChatCompletionResponse.
+          content:
+            application/json:
+              schema:
+                $ref: '#/components/schemas/ListOpenAIChatCompletionResponse'
+        '400':
+          $ref: '#/components/responses/BadRequest400'
+        '429':
+          $ref: >-
+            #/components/responses/TooManyRequests429
+        '500':
+          $ref: >-
+            #/components/responses/InternalServerError500
+        default:
+          $ref: '#/components/responses/DefaultError'
+      tags:
+        - Inference
+      description: List all chat completions.
+      parameters:
+        - name: after
+          in: query
+          description: >-
+            The ID of the last chat completion to return.
+          required: false
+          schema:
+            type: string
+        - name: limit
+          in: query
+          description: >-
+            The maximum number of chat completions to return.
+          required: false
+          schema:
+            type: integer
+        - name: model
+          in: query
+          description: The model to filter by.
+          required: false
+          schema:
+            type: string
+        - name: order
+          in: query
+          description: >-
+            The order to sort the chat completions by: "asc" or "desc". Defaults to
+            "desc".
+          required: false
+          schema:
+            $ref: '#/components/schemas/Order'
+    post:
+      responses:
+        '200':
+          description: An OpenAIChatCompletion.
+          content:
+            application/json:
+              schema:
+                oneOf:
+                  - $ref: '#/components/schemas/OpenAIChatCompletion'
+                  - $ref: '#/components/schemas/OpenAIChatCompletionChunk'
+        '400':
+          $ref: '#/components/responses/BadRequest400'
+        '429':
+          $ref: >-
+            #/components/responses/TooManyRequests429
+        '500':
+          $ref: >-
+            #/components/responses/InternalServerError500
+        default:
+          $ref: '#/components/responses/DefaultError'
+      tags:
+        - Inference
+      description: >-
+        Generate an OpenAI-compatible chat completion for the given messages using
+        the specified model.
+      parameters: []
+      requestBody:
+        content:
+          application/json:
+            schema:
+              $ref: '#/components/schemas/OpenaiChatCompletionRequest'
+        required: true
   /v1/datasets:
     get:
       responses:
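For a quick sense of what the two routes added above accept, a client call might look like the following sketch (assuming a stack listening on the default port 8321 that appears elsewhere in this diff; the model id is a placeholder):

    import requests

    # List recent chat completions, newest first (query parameters from the new spec above).
    page = requests.get(
        "http://localhost:8321/v1/openai/v1/chat/completions",
        params={"limit": 10, "order": "desc", "model": "placeholder-model-id"},
    ).json()

    # Describe one completion, including its input messages, by id.
    completion_id = page["data"][0]["id"]
    detail = requests.get(
        f"http://localhost:8321/v1/openai/v1/chat/completions/{completion_id}"
    ).json()
    print(detail["input_messages"])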
@@ -2261,39 +2373,6 @@ paths:
             schema:
               $ref: '#/components/schemas/LogEventRequest'
       required: true
-  /v1/openai/v1/chat/completions:
-    post:
-      responses:
-        '200':
-          description: An OpenAIChatCompletion.
-          content:
-            application/json:
-              schema:
-                oneOf:
-                  - $ref: '#/components/schemas/OpenAIChatCompletion'
-                  - $ref: '#/components/schemas/OpenAIChatCompletionChunk'
-        '400':
-          $ref: '#/components/responses/BadRequest400'
-        '429':
-          $ref: >-
-            #/components/responses/TooManyRequests429
-        '500':
-          $ref: >-
-            #/components/responses/InternalServerError500
-        default:
-          $ref: '#/components/responses/DefaultError'
-      tags:
-        - Inference
-      description: >-
-        Generate an OpenAI-compatible chat completion for the given messages using
-        the specified model.
-      parameters: []
-      requestBody:
-        content:
-          application/json:
-            schema:
-              $ref: '#/components/schemas/OpenaiChatCompletionRequest'
-        required: true
   /v1/openai/v1/completions:
     post:
       responses:

@@ -5479,6 +5558,369 @@ components:
         - scoring_functions
         - metadata
       title: Benchmark
+    OpenAIAssistantMessageParam:
+      type: object
+      properties:
+        role:
+          type: string
+          const: assistant
+          default: assistant
+          description: >-
+            Must be "assistant" to identify this as the model's response
+        content:
+          oneOf:
+            - type: string
+            - type: array
+              items:
+                $ref: '#/components/schemas/OpenAIChatCompletionContentPartParam'
+          description: The content of the model's response
+        name:
+          type: string
+          description: >-
+            (Optional) The name of the assistant message participant.
+        tool_calls:
+          type: array
+          items:
+            $ref: '#/components/schemas/OpenAIChatCompletionToolCall'
+          description: >-
+            List of tool calls. Each tool call is an OpenAIChatCompletionToolCall
+            object.
+      additionalProperties: false
+      required:
+        - role
+      title: OpenAIAssistantMessageParam
+      description: >-
+        A message containing the model's (assistant) response in an OpenAI-compatible
+        chat completion request.
+    "OpenAIChatCompletionContentPartImageParam":
+      type: object
+      properties:
+        type:
+          type: string
+          const: image_url
+          default: image_url
+        image_url:
+          $ref: '#/components/schemas/OpenAIImageURL'
+      additionalProperties: false
+      required:
+        - type
+        - image_url
+      title: >-
+        OpenAIChatCompletionContentPartImageParam
+    OpenAIChatCompletionContentPartParam:
+      oneOf:
+        - $ref: '#/components/schemas/OpenAIChatCompletionContentPartTextParam'
+        - $ref: '#/components/schemas/OpenAIChatCompletionContentPartImageParam'
+      discriminator:
+        propertyName: type
+        mapping:
+          text: '#/components/schemas/OpenAIChatCompletionContentPartTextParam'
+          image_url: '#/components/schemas/OpenAIChatCompletionContentPartImageParam'
+    OpenAIChatCompletionContentPartTextParam:
+      type: object
+      properties:
+        type:
+          type: string
+          const: text
+          default: text
+        text:
+          type: string
+      additionalProperties: false
+      required:
+        - type
+        - text
+      title: OpenAIChatCompletionContentPartTextParam
+    OpenAIChatCompletionToolCall:
+      type: object
+      properties:
+        index:
+          type: integer
+        id:
+          type: string
+        type:
+          type: string
+          const: function
+          default: function
+        function:
+          $ref: '#/components/schemas/OpenAIChatCompletionToolCallFunction'
+      additionalProperties: false
+      required:
+        - type
+      title: OpenAIChatCompletionToolCall
+    OpenAIChatCompletionToolCallFunction:
+      type: object
+      properties:
+        name:
+          type: string
+        arguments:
+          type: string
+      additionalProperties: false
+      title: OpenAIChatCompletionToolCallFunction
+    OpenAIChoice:
+      type: object
+      properties:
+        message:
+          $ref: '#/components/schemas/OpenAIMessageParam'
+          description: The message from the model
+        finish_reason:
+          type: string
+          description: The reason the model stopped generating
+        index:
+          type: integer
+          description: The index of the choice
+        logprobs:
+          $ref: '#/components/schemas/OpenAIChoiceLogprobs'
+          description: >-
+            (Optional) The log probabilities for the tokens in the message
+      additionalProperties: false
+      required:
+        - message
+        - finish_reason
+        - index
+      title: OpenAIChoice
+      description: >-
+        A choice from an OpenAI-compatible chat completion response.
+    OpenAIChoiceLogprobs:
+      type: object
+      properties:
+        content:
+          type: array
+          items:
+            $ref: '#/components/schemas/OpenAITokenLogProb'
+          description: >-
+            (Optional) The log probabilities for the tokens in the message
+        refusal:
+          type: array
+          items:
+            $ref: '#/components/schemas/OpenAITokenLogProb'
+          description: >-
+            (Optional) The log probabilities for the tokens in the message
+      additionalProperties: false
+      title: OpenAIChoiceLogprobs
+      description: >-
+        The log probabilities for the tokens in the message from an OpenAI-compatible
+        chat completion response.
+    OpenAIDeveloperMessageParam:
+      type: object
+      properties:
+        role:
+          type: string
+          const: developer
+          default: developer
+          description: >-
+            Must be "developer" to identify this as a developer message
+        content:
+          oneOf:
+            - type: string
+            - type: array
+              items:
+                $ref: '#/components/schemas/OpenAIChatCompletionContentPartParam'
+          description: The content of the developer message
+        name:
+          type: string
+          description: >-
+            (Optional) The name of the developer message participant.
+      additionalProperties: false
+      required:
+        - role
+        - content
+      title: OpenAIDeveloperMessageParam
+      description: >-
+        A message from the developer in an OpenAI-compatible chat completion request.
+    OpenAIImageURL:
+      type: object
+      properties:
+        url:
+          type: string
+        detail:
+          type: string
+      additionalProperties: false
+      required:
+        - url
+      title: OpenAIImageURL
+    OpenAIMessageParam:
+      oneOf:
+        - $ref: '#/components/schemas/OpenAIUserMessageParam'
+        - $ref: '#/components/schemas/OpenAISystemMessageParam'
+        - $ref: '#/components/schemas/OpenAIAssistantMessageParam'
+        - $ref: '#/components/schemas/OpenAIToolMessageParam'
+        - $ref: '#/components/schemas/OpenAIDeveloperMessageParam'
+      discriminator:
+        propertyName: role
+        mapping:
+          user: '#/components/schemas/OpenAIUserMessageParam'
+          system: '#/components/schemas/OpenAISystemMessageParam'
+          assistant: '#/components/schemas/OpenAIAssistantMessageParam'
+          tool: '#/components/schemas/OpenAIToolMessageParam'
+          developer: '#/components/schemas/OpenAIDeveloperMessageParam'
+    OpenAISystemMessageParam:
+      type: object
+      properties:
+        role:
+          type: string
+          const: system
+          default: system
+          description: >-
+            Must be "system" to identify this as a system message
+        content:
+          oneOf:
+            - type: string
+            - type: array
+              items:
+                $ref: '#/components/schemas/OpenAIChatCompletionContentPartParam'
+          description: >-
+            The content of the "system prompt". If multiple system messages are provided,
+            they are concatenated. The underlying Llama Stack code may also add other
+            system messages (for example, for formatting tool definitions).
+        name:
+          type: string
+          description: >-
+            (Optional) The name of the system message participant.
+      additionalProperties: false
+      required:
+        - role
+        - content
+      title: OpenAISystemMessageParam
+      description: >-
+        A system message providing instructions or context to the model.
+    OpenAITokenLogProb:
+      type: object
+      properties:
+        token:
+          type: string
+        bytes:
+          type: array
+          items:
+            type: integer
+        logprob:
+          type: number
+        top_logprobs:
+          type: array
+          items:
+            $ref: '#/components/schemas/OpenAITopLogProb'
+      additionalProperties: false
+      required:
+        - token
+        - logprob
+        - top_logprobs
+      title: OpenAITokenLogProb
+      description: >-
+        The log probability for a token from an OpenAI-compatible chat completion
+        response.
+    OpenAIToolMessageParam:
+      type: object
+      properties:
+        role:
+          type: string
+          const: tool
+          default: tool
+          description: >-
+            Must be "tool" to identify this as a tool response
+        tool_call_id:
+          type: string
+          description: >-
+            Unique identifier for the tool call this response is for
+        content:
+          oneOf:
+            - type: string
+            - type: array
+              items:
+                $ref: '#/components/schemas/OpenAIChatCompletionContentPartParam'
+          description: The response content from the tool
+      additionalProperties: false
+      required:
+        - role
+        - tool_call_id
+        - content
+      title: OpenAIToolMessageParam
+      description: >-
+        A message representing the result of a tool invocation in an OpenAI-compatible
+        chat completion request.
+    OpenAITopLogProb:
+      type: object
+      properties:
+        token:
+          type: string
+        bytes:
+          type: array
+          items:
+            type: integer
+        logprob:
+          type: number
+      additionalProperties: false
+      required:
+        - token
+        - logprob
+      title: OpenAITopLogProb
+      description: >-
+        The top log probability for a token from an OpenAI-compatible chat completion
+        response.
+    OpenAIUserMessageParam:
+      type: object
+      properties:
+        role:
+          type: string
+          const: user
+          default: user
+          description: >-
+            Must be "user" to identify this as a user message
+        content:
+          oneOf:
+            - type: string
+            - type: array
+              items:
+                $ref: '#/components/schemas/OpenAIChatCompletionContentPartParam'
+          description: >-
+            The content of the message, which can include text and other media
+        name:
+          type: string
+          description: >-
+            (Optional) The name of the user message participant.
+      additionalProperties: false
+      required:
+        - role
+        - content
+      title: OpenAIUserMessageParam
+      description: >-
+        A message from the user in an OpenAI-compatible chat completion request.
+    OpenAICompletionWithInputMessages:
+      type: object
+      properties:
+        id:
+          type: string
+          description: The ID of the chat completion
+        choices:
+          type: array
+          items:
+            $ref: '#/components/schemas/OpenAIChoice'
+          description: List of choices
+        object:
+          type: string
+          const: chat.completion
+          default: chat.completion
+          description: >-
+            The object type, which will be "chat.completion"
+        created:
+          type: integer
+          description: >-
+            The Unix timestamp in seconds when the chat completion was created
+        model:
+          type: string
+          description: >-
+            The model that was used to generate the chat completion
+        input_messages:
+          type: array
+          items:
+            $ref: '#/components/schemas/OpenAIMessageParam'
+      additionalProperties: false
+      required:
+        - id
+        - choices
+        - object
+        - created
+        - model
+        - input_messages
+      title: OpenAICompletionWithInputMessages
     DataSource:
       oneOf:
         - $ref: '#/components/schemas/URIDataSource'

@@ -6497,6 +6939,73 @@ components:
       required:
         - data
       title: ListBenchmarksResponse
+    Order:
+      type: string
+      enum:
+        - asc
+        - desc
+      title: Order
+    ListOpenAIChatCompletionResponse:
+      type: object
+      properties:
+        data:
+          type: array
+          items:
+            type: object
+            properties:
+              id:
+                type: string
+                description: The ID of the chat completion
+              choices:
+                type: array
+                items:
+                  $ref: '#/components/schemas/OpenAIChoice'
+                description: List of choices
+              object:
+                type: string
+                const: chat.completion
+                default: chat.completion
+                description: >-
+                  The object type, which will be "chat.completion"
+              created:
+                type: integer
+                description: >-
+                  The Unix timestamp in seconds when the chat completion was created
+              model:
+                type: string
+                description: >-
+                  The model that was used to generate the chat completion
+              input_messages:
+                type: array
+                items:
+                  $ref: '#/components/schemas/OpenAIMessageParam'
+            additionalProperties: false
+            required:
+              - id
+              - choices
+              - object
+              - created
+              - model
+              - input_messages
+            title: OpenAICompletionWithInputMessages
+        has_more:
+          type: boolean
+        first_id:
+          type: string
+        last_id:
+          type: string
+        object:
+          type: string
+          const: list
+          default: list
+      additionalProperties: false
+      required:
+        - data
+        - has_more
+        - first_id
+        - last_id
+        - object
+      title: ListOpenAIChatCompletionResponse
     ListDatasetsResponse:
       type: object
       properties:

@@ -6835,142 +7344,6 @@ components:
         - event
         - ttl_seconds
       title: LogEventRequest
-    OpenAIAssistantMessageParam:
-      type: object
-      properties:
-        role:
-          type: string
-          const: assistant
-          default: assistant
-          description: >-
-            Must be "assistant" to identify this as the model's response
-        content:
-          oneOf:
-            - type: string
-            - type: array
-              items:
-                $ref: '#/components/schemas/OpenAIChatCompletionContentPartParam'
-          description: The content of the model's response
-        name:
-          type: string
-          description: >-
-            (Optional) The name of the assistant message participant.
-        tool_calls:
-          type: array
-          items:
-            $ref: '#/components/schemas/OpenAIChatCompletionToolCall'
-          description: >-
-            List of tool calls. Each tool call is an OpenAIChatCompletionToolCall
-            object.
-      additionalProperties: false
-      required:
-        - role
-      title: OpenAIAssistantMessageParam
-      description: >-
-        A message containing the model's (assistant) response in an OpenAI-compatible
-        chat completion request.
-    "OpenAIChatCompletionContentPartImageParam":
-      type: object
-      properties:
-        type:
-          type: string
-          const: image_url
-          default: image_url
-        image_url:
-          $ref: '#/components/schemas/OpenAIImageURL'
-      additionalProperties: false
-      required:
-        - type
-        - image_url
-      title: >-
-        OpenAIChatCompletionContentPartImageParam
-    OpenAIChatCompletionContentPartParam:
-      oneOf:
-        - $ref: '#/components/schemas/OpenAIChatCompletionContentPartTextParam'
-        - $ref: '#/components/schemas/OpenAIChatCompletionContentPartImageParam'
-      discriminator:
-        propertyName: type
-        mapping:
-          text: '#/components/schemas/OpenAIChatCompletionContentPartTextParam'
-          image_url: '#/components/schemas/OpenAIChatCompletionContentPartImageParam'
-    OpenAIChatCompletionContentPartTextParam:
-      type: object
-      properties:
-        type:
-          type: string
-          const: text
-          default: text
-        text:
-          type: string
-      additionalProperties: false
-      required:
-        - type
-        - text
-      title: OpenAIChatCompletionContentPartTextParam
-    OpenAIChatCompletionToolCall:
-      type: object
-      properties:
-        index:
-          type: integer
-        id:
-          type: string
-        type:
-          type: string
-          const: function
-          default: function
-        function:
-          $ref: '#/components/schemas/OpenAIChatCompletionToolCallFunction'
-      additionalProperties: false
-      required:
-        - type
-      title: OpenAIChatCompletionToolCall
-    OpenAIChatCompletionToolCallFunction:
-      type: object
-      properties:
-        name:
-          type: string
-        arguments:
-          type: string
-      additionalProperties: false
-      title: OpenAIChatCompletionToolCallFunction
-    OpenAIDeveloperMessageParam:
-      type: object
-      properties:
-        role:
-          type: string
-          const: developer
-          default: developer
-          description: >-
-            Must be "developer" to identify this as a developer message
-        content:
-          oneOf:
-            - type: string
-            - type: array
-              items:
-                $ref: '#/components/schemas/OpenAIChatCompletionContentPartParam'
-          description: The content of the developer message
-        name:
-          type: string
-          description: >-
-            (Optional) The name of the developer message participant.
-      additionalProperties: false
-      required:
-        - role
-        - content
-      title: OpenAIDeveloperMessageParam
-      description: >-
-        A message from the developer in an OpenAI-compatible chat completion request.
-    OpenAIImageURL:
-      type: object
-      properties:
-        url:
-          type: string
-        detail:
-          type: string
-      additionalProperties: false
-      required:
-        - url
-      title: OpenAIImageURL
     OpenAIJSONSchema:
       type: object
       properties:

@@ -6994,21 +7367,6 @@ components:
       required:
         - name
       title: OpenAIJSONSchema
-    OpenAIMessageParam:
-      oneOf:
-        - $ref: '#/components/schemas/OpenAIUserMessageParam'
-        - $ref: '#/components/schemas/OpenAISystemMessageParam'
-        - $ref: '#/components/schemas/OpenAIAssistantMessageParam'
-        - $ref: '#/components/schemas/OpenAIToolMessageParam'
-        - $ref: '#/components/schemas/OpenAIDeveloperMessageParam'
-      discriminator:
-        propertyName: role
-        mapping:
-          user: '#/components/schemas/OpenAIUserMessageParam'
-          system: '#/components/schemas/OpenAISystemMessageParam'
-          assistant: '#/components/schemas/OpenAIAssistantMessageParam'
-          tool: '#/components/schemas/OpenAIToolMessageParam'
-          developer: '#/components/schemas/OpenAIDeveloperMessageParam'
     OpenAIResponseFormatJSONObject:
       type: object
      properties:

@@ -7056,93 +7414,6 @@ components:
       required:
         - type
       title: OpenAIResponseFormatText
-    OpenAISystemMessageParam:
-      type: object
-      properties:
-        role:
-          type: string
-          const: system
-          default: system
-          description: >-
-            Must be "system" to identify this as a system message
-        content:
-          oneOf:
-            - type: string
-            - type: array
-              items:
-                $ref: '#/components/schemas/OpenAIChatCompletionContentPartParam'
-          description: >-
-            The content of the "system prompt". If multiple system messages are provided,
-            they are concatenated. The underlying Llama Stack code may also add other
-            system messages (for example, for formatting tool definitions).
-        name:
-          type: string
-          description: >-
-            (Optional) The name of the system message participant.
-      additionalProperties: false
-      required:
-        - role
-        - content
-      title: OpenAISystemMessageParam
-      description: >-
-        A system message providing instructions or context to the model.
-    OpenAIToolMessageParam:
-      type: object
-      properties:
-        role:
-          type: string
-          const: tool
-          default: tool
-          description: >-
-            Must be "tool" to identify this as a tool response
-        tool_call_id:
-          type: string
-          description: >-
-            Unique identifier for the tool call this response is for
-        content:
-          oneOf:
-            - type: string
-            - type: array
-              items:
-                $ref: '#/components/schemas/OpenAIChatCompletionContentPartParam'
-          description: The response content from the tool
-      additionalProperties: false
-      required:
-        - role
-        - tool_call_id
-        - content
-      title: OpenAIToolMessageParam
-      description: >-
-        A message representing the result of a tool invocation in an OpenAI-compatible
-        chat completion request.
-    OpenAIUserMessageParam:
-      type: object
-      properties:
-        role:
-          type: string
-          const: user
-          default: user
-          description: >-
-            Must be "user" to identify this as a user message
-        content:
-          oneOf:
-            - type: string
-            - type: array
-              items:
-                $ref: '#/components/schemas/OpenAIChatCompletionContentPartParam'
-          description: >-
-            The content of the message, which can include text and other media
-        name:
-          type: string
-          description: >-
-            (Optional) The name of the user message participant.
-      additionalProperties: false
-      required:
-        - role
-        - content
-      title: OpenAIUserMessageParam
-      description: >-
-        A message from the user in an OpenAI-compatible chat completion request.
     OpenaiChatCompletionRequest:
       type: object
       properties:

@@ -7356,30 +7627,6 @@ components:
       title: OpenAIChatCompletionChunk
       description: >-
         Chunk from a streaming response to an OpenAI-compatible chat completion request.
-    OpenAIChoice:
-      type: object
-      properties:
-        message:
-          $ref: '#/components/schemas/OpenAIMessageParam'
-          description: The message from the model
-        finish_reason:
-          type: string
-          description: The reason the model stopped generating
-        index:
-          type: integer
-          description: The index of the choice
-        logprobs:
-          $ref: '#/components/schemas/OpenAIChoiceLogprobs'
-          description: >-
-            (Optional) The log probabilities for the tokens in the message
-      additionalProperties: false
-      required:
-        - message
-        - finish_reason
-        - index
-      title: OpenAIChoice
-      description: >-
-        A choice from an OpenAI-compatible chat completion response.
     OpenAIChoiceDelta:
       type: object
       properties:

@@ -7401,26 +7648,6 @@ components:
       title: OpenAIChoiceDelta
       description: >-
         A delta from an OpenAI-compatible chat completion streaming response.
-    OpenAIChoiceLogprobs:
-      type: object
-      properties:
-        content:
-          type: array
-          items:
-            $ref: '#/components/schemas/OpenAITokenLogProb'
-          description: >-
-            (Optional) The log probabilities for the tokens in the message
-        refusal:
-          type: array
-          items:
-            $ref: '#/components/schemas/OpenAITokenLogProb'
-          description: >-
-            (Optional) The log probabilities for the tokens in the message
-      additionalProperties: false
-      title: OpenAIChoiceLogprobs
-      description: >-
-        The log probabilities for the tokens in the message from an OpenAI-compatible
-        chat completion response.
     OpenAIChunkChoice:
       type: object
       properties:

@@ -7445,49 +7672,6 @@ components:
       title: OpenAIChunkChoice
       description: >-
         A chunk choice from an OpenAI-compatible chat completion streaming response.
-    OpenAITokenLogProb:
-      type: object
-      properties:
-        token:
-          type: string
-        bytes:
-          type: array
-          items:
-            type: integer
-        logprob:
-          type: number
-        top_logprobs:
-          type: array
-          items:
-            $ref: '#/components/schemas/OpenAITopLogProb'
-      additionalProperties: false
-      required:
-        - token
-        - logprob
-        - top_logprobs
-      title: OpenAITokenLogProb
-      description: >-
-        The log probability for a token from an OpenAI-compatible chat completion
-        response.
-    OpenAITopLogProb:
-      type: object
-      properties:
-        token:
-          type: string
-        bytes:
-          type: array
-          items:
-            type: integer
-        logprob:
-          type: number
-      additionalProperties: false
-      required:
-        - token
-        - logprob
-      title: OpenAITopLogProb
-      description: >-
-        The top log probability for a token from an OpenAI-compatible chat completion
-        response.
     OpenaiCompletionRequest:
       type: object
       properties:
@@ -759,7 +759,7 @@ class Generator:
         )

         return Operation(
-            tags=[op.defining_class.__name__],
+            tags=[getattr(op.defining_class, "API_NAMESPACE", op.defining_class.__name__)],
             summary=None,
             # summary=doc_string.short_description,
             description=description,

@@ -805,6 +805,8 @@ class Generator:
         operation_tags: List[Tag] = []
         for cls in endpoint_classes:
            doc_string = parse_type(cls)
+            if hasattr(cls, "API_NAMESPACE") and cls.API_NAMESPACE != cls.__name__:
+                continue
            operation_tags.append(
                Tag(
                    name=cls.__name__,
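The two generator changes above work together: an operation is tagged with its defining class's API_NAMESPACE when one is set (falling back to the class name), and classes that only exist as a namespaced alias are skipped when emitting tag definitions. A toy illustration of the getattr fallback, mirroring the InferenceProvider class changed later in this diff:

    class InferenceProvider:
        API_NAMESPACE = "Inference"  # operations defined here are grouped under "Inference"

    class Datasets:
        pass  # no override, so the class name itself is the tag

    assert getattr(InferenceProvider, "API_NAMESPACE", InferenceProvider.__name__) == "Inference"
    assert getattr(Datasets, "API_NAMESPACE", Datasets.__name__) == "Datasets"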
@@ -820,15 +820,32 @@ class BatchChatCompletionResponse(BaseModel):
     batch: list[ChatCompletionResponse]


+class OpenAICompletionWithInputMessages(OpenAIChatCompletion):
+    input_messages: list[OpenAIMessageParam]
+
+
+@json_schema_type
+class ListOpenAIChatCompletionResponse(BaseModel):
+    data: list[OpenAICompletionWithInputMessages]
+    has_more: bool
+    first_id: str
+    last_id: str
+    object: Literal["list"] = "list"
+
+
+class Order(Enum):
+    asc = "asc"
+    desc = "desc"
+
+
 @runtime_checkable
 @trace_protocol
-class Inference(Protocol):
-    """Llama Stack Inference API for generating completions, chat completions, and embeddings.
-
-    This API provides the raw interface to the underlying models. Two kinds of models are supported:
-    - LLM models: these models generate "raw" and "chat" (conversational) completions.
-    - Embedding models: these models generate embeddings to be used for semantic search.
-    """
+class InferenceProvider(Protocol):
+    """
+    This protocol defines the interface that should be implemented by all inference providers.
+    """
+
+    API_NAMESPACE: str = "Inference"

     model_store: ModelStore | None = None

@@ -1062,3 +1079,39 @@ class Inference(Protocol):
         :returns: An OpenAIChatCompletion.
         """
         ...
+
+
+class Inference(InferenceProvider):
+    """Llama Stack Inference API for generating completions, chat completions, and embeddings.
+
+    This API provides the raw interface to the underlying models. Two kinds of models are supported:
+    - LLM models: these models generate "raw" and "chat" (conversational) completions.
+    - Embedding models: these models generate embeddings to be used for semantic search.
+    """
+
+    @webmethod(route="/openai/v1/chat/completions", method="GET")
+    async def list_chat_completions(
+        self,
+        after: str | None = None,
+        limit: int | None = 20,
+        model: str | None = None,
+        order: Order | None = Order.desc,
+    ) -> ListOpenAIChatCompletionResponse:
+        """List all chat completions.
+
+        :param after: The ID of the last chat completion to return.
+        :param limit: The maximum number of chat completions to return.
+        :param model: The model to filter by.
+        :param order: The order to sort the chat completions by: "asc" or "desc". Defaults to "desc".
+        :returns: A ListOpenAIChatCompletionResponse.
+        """
+        raise NotImplementedError("List chat completions is not implemented")
+
+    @webmethod(route="/openai/v1/chat/completions/{completion_id}", method="GET")
+    async def get_chat_completion(self, completion_id: str) -> OpenAICompletionWithInputMessages:
+        """Describe a chat completion by its ID.
+
+        :param completion_id: ID of the chat completion.
+        :returns: A OpenAICompletionWithInputMessages.
+        """
+        raise NotImplementedError("Get chat completion is not implemented")
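A minimal sketch of how these two new methods are meant to be called, assuming some impl object that satisfies the full Inference protocol (only field names from the models above are used):

    async def demo(impl: Inference) -> None:
        # Page through recent completions, newest first.
        page = await impl.list_chat_completions(limit=20, order=Order.desc)
        for item in page.data:
            print(item.id, item.model, item.created)
        if page.has_more:
            # Cursor-style pagination via the last id of the previous page.
            page = await impl.list_chat_completions(after=page.last_id, limit=20)

        # Re-fetch one completion together with the messages that produced it.
        detail = await impl.get_chat_completion(page.data[0].id)
        print(detail.input_messages)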
@@ -12,6 +12,7 @@ import shutil
 import sys
 import textwrap
 from functools import lru_cache
+from importlib.abc import Traversable
 from pathlib import Path

 import yaml

@@ -250,11 +251,10 @@ def run_stack_build_command(args: argparse.Namespace) -> None:
         sys.exit(1)

     if args.run:
-        run_config = Path(run_config)
         config_dict = yaml.safe_load(run_config.read_text())
         config = parse_and_maybe_upgrade_config(config_dict)
-        if not os.path.exists(str(config.external_providers_dir)):
-            os.makedirs(str(config.external_providers_dir), exist_ok=True)
+        if not os.path.exists(config.external_providers_dir):
+            os.makedirs(config.external_providers_dir, exist_ok=True)
         run_args = formulate_run_args(args.image_type, args.image_name, config, args.template)
         run_args.extend([str(os.getenv("LLAMA_STACK_PORT", 8321)), "--config", run_config])
         run_command(run_args)

@@ -264,7 +264,7 @@ def _generate_run_config(
     build_config: BuildConfig,
     build_dir: Path,
     image_name: str,
-) -> str:
+) -> Path:
     """
     Generate a run.yaml template file for user to edit from a build.yaml file
     """

@@ -343,7 +343,7 @@ def _run_stack_build_command_from_build_config(
     image_name: str | None = None,
     template_name: str | None = None,
     config_path: str | None = None,
-) -> str:
+) -> Path | Traversable:
     image_name = image_name or build_config.image_name
     if build_config.image_type == LlamaStackImageType.CONTAINER.value:
         if template_name:
@@ -340,8 +340,17 @@ class BuildConfig(BaseModel):
         default=None,
         description="Name of the distribution to build",
     )
-    external_providers_dir: str | None = Field(
+    external_providers_dir: Path | None = Field(
         default=None,
         description="Path to directory containing external provider implementations. The providers packages will be resolved from this directory. "
         "pip_packages MUST contain the provider package name.",
     )
+
+    @field_validator("external_providers_dir")
+    @classmethod
+    def validate_external_providers_dir(cls, v):
+        if v is None:
+            return None
+        if isinstance(v, str):
+            return Path(v)
+        return v
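The new validator keeps YAML configs backward compatible: a plain string in build.yaml is coerced to a Path on load, and None passes through. Its behavior, restated as a standalone function with the same branches (the example directory is arbitrary):

    from pathlib import Path

    def coerce(v):
        # Same branches as validate_external_providers_dir above.
        if v is None:
            return None
        if isinstance(v, str):
            return Path(v)
        return v

    assert coerce(None) is None
    assert coerce("~/.llama/providers.d") == Path("~/.llama/providers.d")
    assert coerce(Path("/tmp/providers.d")) == Path("/tmp/providers.d")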
@@ -226,6 +226,7 @@ class AsyncLlamaStackAsLibraryClient(AsyncLlamaStackClient):
                 distribution_spec=DistributionSpec(
                     providers=provider_types,
                 ),
+                external_providers_dir=self.config.external_providers_dir,
             )
             print_pip_install_help(build_config)
         else:

@@ -13,7 +13,7 @@ from llama_stack.apis.datasetio import DatasetIO
 from llama_stack.apis.datasets import Datasets
 from llama_stack.apis.eval import Eval
 from llama_stack.apis.files import Files
-from llama_stack.apis.inference import Inference
+from llama_stack.apis.inference import Inference, InferenceProvider
 from llama_stack.apis.inspect import Inspect
 from llama_stack.apis.models import Models
 from llama_stack.apis.post_training import PostTraining

@@ -83,6 +83,13 @@ def api_protocol_map() -> dict[Api, Any]:
     }


+def api_protocol_map_for_compliance_check() -> dict[Api, Any]:
+    return {
+        **api_protocol_map(),
+        Api.inference: InferenceProvider,
+    }
+
+
 def additional_protocols_map() -> dict[Api, Any]:
     return {
         Api.inference: (ModelsProtocolPrivate, Models, Api.models),

@@ -302,9 +309,6 @@ async def instantiate_provider(
     inner_impls: dict[str, Any],
     dist_registry: DistributionRegistry,
 ):
-    protocols = api_protocol_map()
-    additional_protocols = additional_protocols_map()
-
     provider_spec = provider.spec
     if not hasattr(provider_spec, "module"):
         raise AttributeError(f"ProviderSpec of type {type(provider_spec)} does not have a 'module' attribute")

@@ -342,6 +346,8 @@ async def instantiate_provider(
     impl.__provider_spec__ = provider_spec
     impl.__provider_config__ = config

+    protocols = api_protocol_map_for_compliance_check()
+    additional_protocols = additional_protocols_map()
     # TODO: check compliance for special tool groups
     # the impl should be for Api.tool_runtime, the name should be the special tool group, the protocol should be the special tool group protocol
     check_protocol_compliance(impl, protocols[provider_spec.api])
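The net effect of these resolver changes: at instantiation time an inference impl is validated against the narrower InferenceProvider protocol rather than the full Inference API, so providers are not forced to implement the new list/get endpoints. A toy model of that distinction (simplified method sets, not the real signatures):

    from typing import Protocol, runtime_checkable

    @runtime_checkable
    class InferenceProvider(Protocol):
        def chat_completion(self) -> str: ...

    @runtime_checkable
    class Inference(InferenceProvider, Protocol):
        def list_chat_completions(self) -> list: ...

    class MyProvider:
        # Implements only the provider surface; no list_chat_completions.
        def chat_completion(self) -> str:
            return "ok"

    impl = MyProvider()
    assert isinstance(impl, InferenceProvider)  # passes the compliance check
    assert not isinstance(impl, Inference)      # the full API would reject it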
@@ -280,7 +280,18 @@ class TracingMiddleware:
             logger.debug(f"No matching endpoint found for path: {path}, falling back to FastAPI")
             return await self.app(scope, receive, send)

-        trace_context = await start_trace(trace_path, {"__location__": "server", "raw_path": path})
+        trace_attributes = {"__location__": "server", "raw_path": path}
+
+        # Extract W3C trace context headers and store as trace attributes
+        headers = dict(scope.get("headers", []))
+        traceparent = headers.get(b"traceparent", b"").decode()
+        if traceparent:
+            trace_attributes["traceparent"] = traceparent
+        tracestate = headers.get(b"tracestate", b"").decode()
+        if tracestate:
+            trace_attributes["tracestate"] = tracestate
+
+        trace_context = await start_trace(trace_path, trace_attributes)

         async def send_with_trace_id(message):
             if message["type"] == "http.response.start":
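With this middleware change, a caller can attach server spans to an existing distributed trace simply by forwarding the standard W3C headers, for example (the traceparent value is the usual example from the W3C Trace Context spec, not a real trace):

    import requests

    headers = {
        # format: version-trace_id-parent_span_id-flags
        "traceparent": "00-4bf92f3577b34da6a3ce929d0e0e4736-00f067aa0ba902b7-01",
        "tracestate": "vendor1=opaque-value",
    }
    requests.get(
        "http://localhost:8321/v1/openai/v1/chat/completions",
        headers=headers,
    )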
@@ -370,14 +381,6 @@ def main(args: argparse.Namespace | None = None):
     if args is None:
         args = parser.parse_args()

-    # Check for deprecated argument usage
-    if "--config" in sys.argv:
-        warnings.warn(
-            "The '--config' argument is deprecated and will be removed in a future version. Use '--config' instead.",
-            DeprecationWarning,
-            stacklevel=2,
-        )
-
     log_line = ""
     if args.config:
         # if the user provided a config file, use it, even if template was specified

@@ -28,7 +28,7 @@ from llama_stack.apis.inference import (
     CompletionRequest,
     CompletionResponse,
     CompletionResponseStreamChunk,
-    Inference,
+    InferenceProvider,
     InterleavedContent,
     LogProbConfig,
     Message,

@@ -86,7 +86,7 @@ class MetaReferenceInferenceImpl(
     OpenAICompletionToLlamaStackMixin,
     OpenAIChatCompletionToLlamaStackMixin,
     SentenceTransformerEmbeddingMixin,
-    Inference,
+    InferenceProvider,
     ModelsProtocolPrivate,
 ):
     def __init__(self, config: MetaReferenceInferenceConfig) -> None:

@@ -9,7 +9,7 @@ from collections.abc import AsyncGenerator

 from llama_stack.apis.inference import (
     CompletionResponse,
-    Inference,
+    InferenceProvider,
     InterleavedContent,
     LogProbConfig,
     Message,

@@ -38,7 +38,7 @@ class SentenceTransformersInferenceImpl(
     OpenAIChatCompletionToLlamaStackMixin,
     OpenAICompletionToLlamaStackMixin,
     SentenceTransformerEmbeddingMixin,
-    Inference,
+    InferenceProvider,
     ModelsProtocolPrivate,
 ):
     def __init__(self, config: SentenceTransformersInferenceConfig) -> None:

@@ -16,6 +16,7 @@ from opentelemetry.sdk.resources import Resource
 from opentelemetry.sdk.trace import TracerProvider
 from opentelemetry.sdk.trace.export import BatchSpanProcessor
 from opentelemetry.semconv.resource import ResourceAttributes
+from opentelemetry.trace.propagation.tracecontext import TraceContextTextMapPropagator

 from llama_stack.apis.telemetry import (
     Event,

@@ -44,6 +45,7 @@ from llama_stack.providers.inline.telemetry.meta_reference.sqlite_span_processor
 )
 from llama_stack.providers.utils.telemetry.dataset_mixin import TelemetryDatasetMixin
 from llama_stack.providers.utils.telemetry.sqlite_trace_store import SQLiteTraceStore
+from llama_stack.providers.utils.telemetry.tracing import ROOT_SPAN_MARKERS

 from .config import TelemetryConfig, TelemetrySink

@@ -206,6 +208,15 @@ class TelemetryAdapter(TelemetryDatasetMixin, Telemetry):
             event.attributes = {}
         event.attributes["__ttl__"] = ttl_seconds

+        # Extract these W3C trace context attributes so they are not written to
+        # underlying storage, as we just need them to propagate the trace context.
+        traceparent = event.attributes.pop("traceparent", None)
+        tracestate = event.attributes.pop("tracestate", None)
+        if traceparent:
+            # If we have a traceparent header value, we're not the root span.
+            for root_attribute in ROOT_SPAN_MARKERS:
+                event.attributes.pop(root_attribute, None)
+
         if isinstance(event.payload, SpanStartPayload):
             # Check if span already exists to prevent duplicates
             if span_id in _GLOBAL_STORAGE["active_spans"]:

@@ -216,8 +227,12 @@ class TelemetryAdapter(TelemetryDatasetMixin, Telemetry):
                 parent_span_id = int(event.payload.parent_span_id, 16)
                 parent_span = _GLOBAL_STORAGE["active_spans"].get(parent_span_id)
                 context = trace.set_span_in_context(parent_span)
-            else:
-                event.attributes["__root_span__"] = "true"
+            elif traceparent:
+                carrier = {
+                    "traceparent": traceparent,
+                    "tracestate": tracestate,
+                }
+                context = TraceContextTextMapPropagator().extract(carrier=carrier)

             span = tracer.start_span(
                 name=event.payload.name,
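For reference, the propagator used in the telemetry hunk above consumes a plain dict carrier; a standalone sketch of the extraction step (traceparent value again taken from the W3C spec examples):

    from opentelemetry import trace
    from opentelemetry.trace.propagation.tracecontext import TraceContextTextMapPropagator

    carrier = {"traceparent": "00-4bf92f3577b34da6a3ce929d0e0e4736-00f067aa0ba902b7-01"}
    ctx = TraceContextTextMapPropagator().extract(carrier=carrier)
    # Spans started under ctx become children of the remote span from the header.
    span_ctx = trace.get_current_span(ctx).get_span_context()
    print(format(span_ctx.trace_id, "032x"))  # 4bf92f3577b34da6a3ce929d0e0e4736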
@@ -4,12 +4,12 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.

-from llama_stack.apis.inference import Inference
+from llama_stack.apis.inference import InferenceProvider

 from .config import CerebrasCompatConfig


-async def get_adapter_impl(config: CerebrasCompatConfig, _deps) -> Inference:
+async def get_adapter_impl(config: CerebrasCompatConfig, _deps) -> InferenceProvider:
     # import dynamically so the import is used only when it is needed
     from .cerebras import CerebrasCompatInferenceAdapter

@@ -4,12 +4,12 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.

-from llama_stack.apis.inference import Inference
+from llama_stack.apis.inference import InferenceProvider

 from .config import FireworksCompatConfig


-async def get_adapter_impl(config: FireworksCompatConfig, _deps) -> Inference:
+async def get_adapter_impl(config: FireworksCompatConfig, _deps) -> InferenceProvider:
     # import dynamically so the import is used only when it is needed
     from .fireworks import FireworksCompatInferenceAdapter

@@ -4,12 +4,12 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.

-from llama_stack.apis.inference import Inference
+from llama_stack.apis.inference import InferenceProvider

 from .config import GroqCompatConfig


-async def get_adapter_impl(config: GroqCompatConfig, _deps) -> Inference:
+async def get_adapter_impl(config: GroqCompatConfig, _deps) -> InferenceProvider:
     # import dynamically so the import is used only when it is needed
     from .groq import GroqCompatInferenceAdapter

@@ -4,12 +4,12 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.

-from llama_stack.apis.inference import Inference
+from llama_stack.apis.inference import InferenceProvider

 from .config import LlamaCompatConfig


-async def get_adapter_impl(config: LlamaCompatConfig, _deps) -> Inference:
+async def get_adapter_impl(config: LlamaCompatConfig, _deps) -> InferenceProvider:
     # import dynamically so the import is used only when it is needed
     from .llama import LlamaCompatInferenceAdapter

@@ -28,7 +28,7 @@ from llama_stack.apis.inference import (
     EmbeddingsResponse,
     EmbeddingTaskType,
     GrammarResponseFormat,
-    Inference,
+    InferenceProvider,
     JsonSchemaResponseFormat,
     LogProbConfig,
     Message,

@@ -82,7 +82,7 @@ logger = get_logger(name=__name__, category="inference")


 class OllamaInferenceAdapter(
-    Inference,
+    InferenceProvider,
     ModelsProtocolPrivate,
 ):
     def __init__(self, url: str) -> None:

@@ -4,12 +4,12 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.

-from llama_stack.apis.inference import Inference
+from llama_stack.apis.inference import InferenceProvider

 from .config import SambaNovaCompatConfig


-async def get_adapter_impl(config: SambaNovaCompatConfig, _deps) -> Inference:
+async def get_adapter_impl(config: SambaNovaCompatConfig, _deps) -> InferenceProvider:
     # import dynamically so the import is used only when it is needed
     from .sambanova import SambaNovaCompatInferenceAdapter

@@ -4,12 +4,12 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.

-from llama_stack.apis.inference import Inference
+from llama_stack.apis.inference import InferenceProvider

 from .config import TogetherCompatConfig


-async def get_adapter_impl(config: TogetherCompatConfig, _deps) -> Inference:
+async def get_adapter_impl(config: TogetherCompatConfig, _deps) -> InferenceProvider:
     # import dynamically so the import is used only when it is needed
     from .together import TogetherCompatInferenceAdapter

@@ -19,7 +19,7 @@ from llama_stack.apis.inference import (
     ChatCompletionResponseStreamChunk,
     EmbeddingsResponse,
     EmbeddingTaskType,
-    Inference,
+    InferenceProvider,
     JsonSchemaResponseFormat,
     LogProbConfig,
     Message,

@@ -59,7 +59,7 @@ logger = get_logger(name=__name__, category="inference")


 class LiteLLMOpenAIMixin(
     ModelRegistryHelper,
-    Inference,
+    InferenceProvider,
     NeedsRequestProviderData,
 ):
     # TODO: avoid exposing the litellm specific model names to the user.

@@ -34,6 +34,8 @@ logger = get_logger(__name__, category="core")
 INVALID_SPAN_ID = 0x0000000000000000
 INVALID_TRACE_ID = 0x00000000000000000000000000000000

+ROOT_SPAN_MARKERS = ["__root__", "__root_span__"]
+

 def trace_id_to_str(trace_id: int) -> str:
     """Convenience trace ID formatting method
@@ -178,7 +180,8 @@ async def start_trace(name: str, attributes: dict[str, Any] = None) -> TraceContext:

     trace_id = generate_trace_id()
     context = TraceContext(BACKGROUND_LOGGER, trace_id)
-    context.push_span(name, {"__root__": True, **(attributes or {})})
+    attributes = {marker: True for marker in ROOT_SPAN_MARKERS} | (attributes or {})
+    context.push_span(name, attributes)

     CURRENT_TRACE_CONTEXT.set(context)
     return context
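One subtlety in the rewritten merge: Python's dict | operator keeps the right-hand value on key collisions, so caller-supplied attributes still override the root-span markers, preserving the precedence of the old {"__root__": True, **(attributes or {})} form. A quick check (marker names from the diff; the sample attributes are illustrative):

    ROOT_SPAN_MARKERS = ["__root__", "__root_span__"]

    attributes = {"__root__": False, "raw_path": "/v1/models"}
    merged = {marker: True for marker in ROOT_SPAN_MARKERS} | attributes
    assert merged == {"__root__": False, "__root_span__": True, "raw_path": "/v1/models"}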