Mirror of https://github.com/meta-llama/llama-stack.git (synced 2025-07-31 16:01:46 +00:00)

Commit f0a142f5a8: Merge branch 'main' into patch-metadata

21 changed files with 1405 additions and 887 deletions

docs/_static/llama-stack-spec.html (vendored, 1202 lines changed)
File diff suppressed because it is too large

docs/_static/llama-stack-spec.yaml (vendored, 900 lines changed)
@@ -827,6 +827,35 @@ paths:
           required: true
           schema:
             type: string
+  /v1/openai/v1/chat/completions/{completion_id}:
+    get:
+      responses:
+        '200':
+          description: A OpenAICompletionWithInputMessages.
+          content:
+            application/json:
+              schema:
+                $ref: '#/components/schemas/OpenAICompletionWithInputMessages'
+        '400':
+          $ref: '#/components/responses/BadRequest400'
+        '429':
+          $ref: >-
+            #/components/responses/TooManyRequests429
+        '500':
+          $ref: >-
+            #/components/responses/InternalServerError500
+        default:
+          $ref: '#/components/responses/DefaultError'
+      tags:
+        - Inference
+      description: Describe a chat completion by its ID.
+      parameters:
+        - name: completion_id
+          in: path
+          description: ID of the chat completion.
+          required: true
+          schema:
+            type: string
   /v1/datasets/{dataset_id}:
     get:
       responses:
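As a usage sketch of the new retrieval route (not part of the diff; the base URL and completion ID below are made-up examples, assuming a Llama Stack server listening locally):

import requests

base_url = "http://localhost:8321"  # assumed local server address
completion_id = "chatcmpl-123"      # hypothetical ID returned by an earlier POST

resp = requests.get(f"{base_url}/v1/openai/v1/chat/completions/{completion_id}")
resp.raise_for_status()
completion = resp.json()  # an OpenAICompletionWithInputMessages payload
print(completion["model"], completion["choices"][0]["message"])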
@@ -1795,6 +1824,89 @@ paths:
             schema:
               $ref: '#/components/schemas/RegisterBenchmarkRequest'
         required: true
+  /v1/openai/v1/chat/completions:
+    get:
+      responses:
+        '200':
+          description: A ListOpenAIChatCompletionResponse.
+          content:
+            application/json:
+              schema:
+                $ref: '#/components/schemas/ListOpenAIChatCompletionResponse'
+        '400':
+          $ref: '#/components/responses/BadRequest400'
+        '429':
+          $ref: >-
+            #/components/responses/TooManyRequests429
+        '500':
+          $ref: >-
+            #/components/responses/InternalServerError500
+        default:
+          $ref: '#/components/responses/DefaultError'
+      tags:
+        - Inference
+      description: List all chat completions.
+      parameters:
+        - name: after
+          in: query
+          description: >-
+            The ID of the last chat completion to return.
+          required: false
+          schema:
+            type: string
+        - name: limit
+          in: query
+          description: >-
+            The maximum number of chat completions to return.
+          required: false
+          schema:
+            type: integer
+        - name: model
+          in: query
+          description: The model to filter by.
+          required: false
+          schema:
+            type: string
+        - name: order
+          in: query
+          description: >-
+            The order to sort the chat completions by: "asc" or "desc". Defaults to
+            "desc".
+          required: false
+          schema:
+            $ref: '#/components/schemas/Order'
+    post:
+      responses:
+        '200':
+          description: An OpenAIChatCompletion.
+          content:
+            application/json:
+              schema:
+                oneOf:
+                  - $ref: '#/components/schemas/OpenAIChatCompletion'
+                  - $ref: '#/components/schemas/OpenAIChatCompletionChunk'
+        '400':
+          $ref: '#/components/responses/BadRequest400'
+        '429':
+          $ref: >-
+            #/components/responses/TooManyRequests429
+        '500':
+          $ref: >-
+            #/components/responses/InternalServerError500
+        default:
+          $ref: '#/components/responses/DefaultError'
+      tags:
+        - Inference
+      description: >-
+        Generate an OpenAI-compatible chat completion for the given messages using
+        the specified model.
+      parameters: []
+      requestBody:
+        content:
+          application/json:
+            schema:
+              $ref: '#/components/schemas/OpenaiChatCompletionRequest'
+        required: true
   /v1/datasets:
     get:
       responses:
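A corresponding sketch for the list route above, including cursor pagination via the `after` parameter (again an example with assumed values, not part of the diff; the model name is made up):

import requests

base_url = "http://localhost:8321"  # assumed local server address
params = {"limit": 5, "model": "llama3.2:3b", "order": "desc"}  # example filter values

page = requests.get(f"{base_url}/v1/openai/v1/chat/completions", params=params).json()
for item in page["data"]:
    print(item["id"], item["created"])

if page["has_more"]:
    # Pass the last returned ID back as `after` to fetch the next page.
    next_page = requests.get(
        f"{base_url}/v1/openai/v1/chat/completions",
        params={**params, "after": page["last_id"]},
    ).json()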
@@ -2261,39 +2373,6 @@ paths:
             schema:
               $ref: '#/components/schemas/LogEventRequest'
         required: true
-  /v1/openai/v1/chat/completions:
-    post:
-      responses:
-        '200':
-          description: An OpenAIChatCompletion.
-          content:
-            application/json:
-              schema:
-                oneOf:
-                  - $ref: '#/components/schemas/OpenAIChatCompletion'
-                  - $ref: '#/components/schemas/OpenAIChatCompletionChunk'
-        '400':
-          $ref: '#/components/responses/BadRequest400'
-        '429':
-          $ref: >-
-            #/components/responses/TooManyRequests429
-        '500':
-          $ref: >-
-            #/components/responses/InternalServerError500
-        default:
-          $ref: '#/components/responses/DefaultError'
-      tags:
-        - Inference
-      description: >-
-        Generate an OpenAI-compatible chat completion for the given messages using
-        the specified model.
-      parameters: []
-      requestBody:
-        content:
-          application/json:
-            schema:
-              $ref: '#/components/schemas/OpenaiChatCompletionRequest'
-        required: true
   /v1/openai/v1/completions:
     post:
       responses:
@@ -5479,6 +5558,369 @@ components:
         - scoring_functions
         - metadata
       title: Benchmark
+    OpenAIAssistantMessageParam:
+      type: object
+      properties:
+        role:
+          type: string
+          const: assistant
+          default: assistant
+          description: >-
+            Must be "assistant" to identify this as the model's response
+        content:
+          oneOf:
+            - type: string
+            - type: array
+              items:
+                $ref: '#/components/schemas/OpenAIChatCompletionContentPartParam'
+          description: The content of the model's response
+        name:
+          type: string
+          description: >-
+            (Optional) The name of the assistant message participant.
+        tool_calls:
+          type: array
+          items:
+            $ref: '#/components/schemas/OpenAIChatCompletionToolCall'
+          description: >-
+            List of tool calls. Each tool call is an OpenAIChatCompletionToolCall
+            object.
+      additionalProperties: false
+      required:
+        - role
+      title: OpenAIAssistantMessageParam
+      description: >-
+        A message containing the model's (assistant) response in an OpenAI-compatible
+        chat completion request.
+    "OpenAIChatCompletionContentPartImageParam":
+      type: object
+      properties:
+        type:
+          type: string
+          const: image_url
+          default: image_url
+        image_url:
+          $ref: '#/components/schemas/OpenAIImageURL'
+      additionalProperties: false
+      required:
+        - type
+        - image_url
+      title: >-
+        OpenAIChatCompletionContentPartImageParam
+    OpenAIChatCompletionContentPartParam:
+      oneOf:
+        - $ref: '#/components/schemas/OpenAIChatCompletionContentPartTextParam'
+        - $ref: '#/components/schemas/OpenAIChatCompletionContentPartImageParam'
+      discriminator:
+        propertyName: type
+        mapping:
+          text: '#/components/schemas/OpenAIChatCompletionContentPartTextParam'
+          image_url: '#/components/schemas/OpenAIChatCompletionContentPartImageParam'
+    OpenAIChatCompletionContentPartTextParam:
+      type: object
+      properties:
+        type:
+          type: string
+          const: text
+          default: text
+        text:
+          type: string
+      additionalProperties: false
+      required:
+        - type
+        - text
+      title: OpenAIChatCompletionContentPartTextParam
+    OpenAIChatCompletionToolCall:
+      type: object
+      properties:
+        index:
+          type: integer
+        id:
+          type: string
+        type:
+          type: string
+          const: function
+          default: function
+        function:
+          $ref: '#/components/schemas/OpenAIChatCompletionToolCallFunction'
+      additionalProperties: false
+      required:
+        - type
+      title: OpenAIChatCompletionToolCall
+    OpenAIChatCompletionToolCallFunction:
+      type: object
+      properties:
+        name:
+          type: string
+        arguments:
+          type: string
+      additionalProperties: false
+      title: OpenAIChatCompletionToolCallFunction
+    OpenAIChoice:
+      type: object
+      properties:
+        message:
+          $ref: '#/components/schemas/OpenAIMessageParam'
+          description: The message from the model
+        finish_reason:
+          type: string
+          description: The reason the model stopped generating
+        index:
+          type: integer
+          description: The index of the choice
+        logprobs:
+          $ref: '#/components/schemas/OpenAIChoiceLogprobs'
+          description: >-
+            (Optional) The log probabilities for the tokens in the message
+      additionalProperties: false
+      required:
+        - message
+        - finish_reason
+        - index
+      title: OpenAIChoice
+      description: >-
+        A choice from an OpenAI-compatible chat completion response.
+    OpenAIChoiceLogprobs:
+      type: object
+      properties:
+        content:
+          type: array
+          items:
+            $ref: '#/components/schemas/OpenAITokenLogProb'
+          description: >-
+            (Optional) The log probabilities for the tokens in the message
+        refusal:
+          type: array
+          items:
+            $ref: '#/components/schemas/OpenAITokenLogProb'
+          description: >-
+            (Optional) The log probabilities for the tokens in the message
+      additionalProperties: false
+      title: OpenAIChoiceLogprobs
+      description: >-
+        The log probabilities for the tokens in the message from an OpenAI-compatible
+        chat completion response.
+    OpenAIDeveloperMessageParam:
+      type: object
+      properties:
+        role:
+          type: string
+          const: developer
+          default: developer
+          description: >-
+            Must be "developer" to identify this as a developer message
+        content:
+          oneOf:
+            - type: string
+            - type: array
+              items:
+                $ref: '#/components/schemas/OpenAIChatCompletionContentPartParam'
+          description: The content of the developer message
+        name:
+          type: string
+          description: >-
+            (Optional) The name of the developer message participant.
+      additionalProperties: false
+      required:
+        - role
+        - content
+      title: OpenAIDeveloperMessageParam
+      description: >-
+        A message from the developer in an OpenAI-compatible chat completion request.
+    OpenAIImageURL:
+      type: object
+      properties:
+        url:
+          type: string
+        detail:
+          type: string
+      additionalProperties: false
+      required:
+        - url
+      title: OpenAIImageURL
+    OpenAIMessageParam:
+      oneOf:
+        - $ref: '#/components/schemas/OpenAIUserMessageParam'
+        - $ref: '#/components/schemas/OpenAISystemMessageParam'
+        - $ref: '#/components/schemas/OpenAIAssistantMessageParam'
+        - $ref: '#/components/schemas/OpenAIToolMessageParam'
+        - $ref: '#/components/schemas/OpenAIDeveloperMessageParam'
+      discriminator:
+        propertyName: role
+        mapping:
+          user: '#/components/schemas/OpenAIUserMessageParam'
+          system: '#/components/schemas/OpenAISystemMessageParam'
+          assistant: '#/components/schemas/OpenAIAssistantMessageParam'
+          tool: '#/components/schemas/OpenAIToolMessageParam'
+          developer: '#/components/schemas/OpenAIDeveloperMessageParam'
+    OpenAISystemMessageParam:
+      type: object
+      properties:
+        role:
+          type: string
+          const: system
+          default: system
+          description: >-
+            Must be "system" to identify this as a system message
+        content:
+          oneOf:
+            - type: string
+            - type: array
+              items:
+                $ref: '#/components/schemas/OpenAIChatCompletionContentPartParam'
+          description: >-
+            The content of the "system prompt". If multiple system messages are provided,
+            they are concatenated. The underlying Llama Stack code may also add other
+            system messages (for example, for formatting tool definitions).
+        name:
+          type: string
+          description: >-
+            (Optional) The name of the system message participant.
+      additionalProperties: false
+      required:
+        - role
+        - content
+      title: OpenAISystemMessageParam
+      description: >-
+        A system message providing instructions or context to the model.
+    OpenAITokenLogProb:
+      type: object
+      properties:
+        token:
+          type: string
+        bytes:
+          type: array
+          items:
+            type: integer
+        logprob:
+          type: number
+        top_logprobs:
+          type: array
+          items:
+            $ref: '#/components/schemas/OpenAITopLogProb'
+      additionalProperties: false
+      required:
+        - token
+        - logprob
+        - top_logprobs
+      title: OpenAITokenLogProb
+      description: >-
+        The log probability for a token from an OpenAI-compatible chat completion
+        response.
+    OpenAIToolMessageParam:
+      type: object
+      properties:
+        role:
+          type: string
+          const: tool
+          default: tool
+          description: >-
+            Must be "tool" to identify this as a tool response
+        tool_call_id:
+          type: string
+          description: >-
+            Unique identifier for the tool call this response is for
+        content:
+          oneOf:
+            - type: string
+            - type: array
+              items:
+                $ref: '#/components/schemas/OpenAIChatCompletionContentPartParam'
+          description: The response content from the tool
+      additionalProperties: false
+      required:
+        - role
+        - tool_call_id
+        - content
+      title: OpenAIToolMessageParam
+      description: >-
+        A message representing the result of a tool invocation in an OpenAI-compatible
+        chat completion request.
+    OpenAITopLogProb:
+      type: object
+      properties:
+        token:
+          type: string
+        bytes:
+          type: array
+          items:
+            type: integer
+        logprob:
+          type: number
+      additionalProperties: false
+      required:
+        - token
+        - logprob
+      title: OpenAITopLogProb
+      description: >-
+        The top log probability for a token from an OpenAI-compatible chat completion
+        response.
+    OpenAIUserMessageParam:
+      type: object
+      properties:
+        role:
+          type: string
+          const: user
+          default: user
+          description: >-
+            Must be "user" to identify this as a user message
+        content:
+          oneOf:
+            - type: string
+            - type: array
+              items:
+                $ref: '#/components/schemas/OpenAIChatCompletionContentPartParam'
+          description: >-
+            The content of the message, which can include text and other media
+        name:
+          type: string
+          description: >-
+            (Optional) The name of the user message participant.
+      additionalProperties: false
+      required:
+        - role
+        - content
+      title: OpenAIUserMessageParam
+      description: >-
+        A message from the user in an OpenAI-compatible chat completion request.
+    OpenAICompletionWithInputMessages:
+      type: object
+      properties:
+        id:
+          type: string
+          description: The ID of the chat completion
+        choices:
+          type: array
+          items:
+            $ref: '#/components/schemas/OpenAIChoice'
+          description: List of choices
+        object:
+          type: string
+          const: chat.completion
+          default: chat.completion
+          description: >-
+            The object type, which will be "chat.completion"
+        created:
+          type: integer
+          description: >-
+            The Unix timestamp in seconds when the chat completion was created
+        model:
+          type: string
+          description: >-
+            The model that was used to generate the chat completion
+        input_messages:
+          type: array
+          items:
+            $ref: '#/components/schemas/OpenAIMessageParam'
+      additionalProperties: false
+      required:
+        - id
+        - choices
+        - object
+        - created
+        - model
+        - input_messages
+      title: OpenAICompletionWithInputMessages
     DataSource:
       oneOf:
         - $ref: '#/components/schemas/URIDataSource'
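To make the discriminated unions above concrete, here are hypothetical payloads (all values invented for illustration) that would validate against these schemas; the "role" and "type" fields select the union variant:

user_message = {
    "role": "user",  # selects OpenAIUserMessageParam
    "content": [
        {"type": "text", "text": "What is in this image?"},
        {"type": "image_url", "image_url": {"url": "https://example.com/cat.png"}},
    ],
}

assistant_message = {
    "role": "assistant",  # selects OpenAIAssistantMessageParam
    "tool_calls": [
        {
            "type": "function",
            "id": "call_abc123",  # example ID
            "function": {"name": "get_weather", "arguments": '{"city": "Paris"}'},
        }
    ],
}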
@@ -6497,6 +6939,73 @@ components:
       required:
         - data
       title: ListBenchmarksResponse
+    Order:
+      type: string
+      enum:
+        - asc
+        - desc
+      title: Order
+    ListOpenAIChatCompletionResponse:
+      type: object
+      properties:
+        data:
+          type: array
+          items:
+            type: object
+            properties:
+              id:
+                type: string
+                description: The ID of the chat completion
+              choices:
+                type: array
+                items:
+                  $ref: '#/components/schemas/OpenAIChoice'
+                description: List of choices
+              object:
+                type: string
+                const: chat.completion
+                default: chat.completion
+                description: >-
+                  The object type, which will be "chat.completion"
+              created:
+                type: integer
+                description: >-
+                  The Unix timestamp in seconds when the chat completion was created
+              model:
+                type: string
+                description: >-
+                  The model that was used to generate the chat completion
+              input_messages:
+                type: array
+                items:
+                  $ref: '#/components/schemas/OpenAIMessageParam'
+            additionalProperties: false
+            required:
+              - id
+              - choices
+              - object
+              - created
+              - model
+              - input_messages
+            title: OpenAICompletionWithInputMessages
+        has_more:
+          type: boolean
+        first_id:
+          type: string
+        last_id:
+          type: string
+        object:
+          type: string
+          const: list
+          default: list
+      additionalProperties: false
+      required:
+        - data
+        - has_more
+        - first_id
+        - last_id
+        - object
+      title: ListOpenAIChatCompletionResponse
     ListDatasetsResponse:
       type: object
       properties:
@@ -6835,142 +7344,6 @@ components:
         - event
         - ttl_seconds
       title: LogEventRequest
-    OpenAIAssistantMessageParam:
-      type: object
-      properties:
-        role:
-          type: string
-          const: assistant
-          default: assistant
-          description: >-
-            Must be "assistant" to identify this as the model's response
-        content:
-          oneOf:
-            - type: string
-            - type: array
-              items:
-                $ref: '#/components/schemas/OpenAIChatCompletionContentPartParam'
-          description: The content of the model's response
-        name:
-          type: string
-          description: >-
-            (Optional) The name of the assistant message participant.
-        tool_calls:
-          type: array
-          items:
-            $ref: '#/components/schemas/OpenAIChatCompletionToolCall'
-          description: >-
-            List of tool calls. Each tool call is an OpenAIChatCompletionToolCall
-            object.
-      additionalProperties: false
-      required:
-        - role
-      title: OpenAIAssistantMessageParam
-      description: >-
-        A message containing the model's (assistant) response in an OpenAI-compatible
-        chat completion request.
-    "OpenAIChatCompletionContentPartImageParam":
-      type: object
-      properties:
-        type:
-          type: string
-          const: image_url
-          default: image_url
-        image_url:
-          $ref: '#/components/schemas/OpenAIImageURL'
-      additionalProperties: false
-      required:
-        - type
-        - image_url
-      title: >-
-        OpenAIChatCompletionContentPartImageParam
-    OpenAIChatCompletionContentPartParam:
-      oneOf:
-        - $ref: '#/components/schemas/OpenAIChatCompletionContentPartTextParam'
-        - $ref: '#/components/schemas/OpenAIChatCompletionContentPartImageParam'
-      discriminator:
-        propertyName: type
-        mapping:
-          text: '#/components/schemas/OpenAIChatCompletionContentPartTextParam'
-          image_url: '#/components/schemas/OpenAIChatCompletionContentPartImageParam'
-    OpenAIChatCompletionContentPartTextParam:
-      type: object
-      properties:
-        type:
-          type: string
-          const: text
-          default: text
-        text:
-          type: string
-      additionalProperties: false
-      required:
-        - type
-        - text
-      title: OpenAIChatCompletionContentPartTextParam
-    OpenAIChatCompletionToolCall:
-      type: object
-      properties:
-        index:
-          type: integer
-        id:
-          type: string
-        type:
-          type: string
-          const: function
-          default: function
-        function:
-          $ref: '#/components/schemas/OpenAIChatCompletionToolCallFunction'
-      additionalProperties: false
-      required:
-        - type
-      title: OpenAIChatCompletionToolCall
-    OpenAIChatCompletionToolCallFunction:
-      type: object
-      properties:
-        name:
-          type: string
-        arguments:
-          type: string
-      additionalProperties: false
-      title: OpenAIChatCompletionToolCallFunction
-    OpenAIDeveloperMessageParam:
-      type: object
-      properties:
-        role:
-          type: string
-          const: developer
-          default: developer
-          description: >-
-            Must be "developer" to identify this as a developer message
-        content:
-          oneOf:
-            - type: string
-            - type: array
-              items:
-                $ref: '#/components/schemas/OpenAIChatCompletionContentPartParam'
-          description: The content of the developer message
-        name:
-          type: string
-          description: >-
-            (Optional) The name of the developer message participant.
-      additionalProperties: false
-      required:
-        - role
-        - content
-      title: OpenAIDeveloperMessageParam
-      description: >-
-        A message from the developer in an OpenAI-compatible chat completion request.
-    OpenAIImageURL:
-      type: object
-      properties:
-        url:
-          type: string
-        detail:
-          type: string
-      additionalProperties: false
-      required:
-        - url
-      title: OpenAIImageURL
    OpenAIJSONSchema:
      type: object
      properties:
@@ -6994,21 +7367,6 @@ components:
       required:
         - name
       title: OpenAIJSONSchema
-    OpenAIMessageParam:
-      oneOf:
-        - $ref: '#/components/schemas/OpenAIUserMessageParam'
-        - $ref: '#/components/schemas/OpenAISystemMessageParam'
-        - $ref: '#/components/schemas/OpenAIAssistantMessageParam'
-        - $ref: '#/components/schemas/OpenAIToolMessageParam'
-        - $ref: '#/components/schemas/OpenAIDeveloperMessageParam'
-      discriminator:
-        propertyName: role
-        mapping:
-          user: '#/components/schemas/OpenAIUserMessageParam'
-          system: '#/components/schemas/OpenAISystemMessageParam'
-          assistant: '#/components/schemas/OpenAIAssistantMessageParam'
-          tool: '#/components/schemas/OpenAIToolMessageParam'
-          developer: '#/components/schemas/OpenAIDeveloperMessageParam'
     OpenAIResponseFormatJSONObject:
       type: object
       properties:
@@ -7056,93 +7414,6 @@ components:
       required:
         - type
       title: OpenAIResponseFormatText
-    OpenAISystemMessageParam:
-      type: object
-      properties:
-        role:
-          type: string
-          const: system
-          default: system
-          description: >-
-            Must be "system" to identify this as a system message
-        content:
-          oneOf:
-            - type: string
-            - type: array
-              items:
-                $ref: '#/components/schemas/OpenAIChatCompletionContentPartParam'
-          description: >-
-            The content of the "system prompt". If multiple system messages are provided,
-            they are concatenated. The underlying Llama Stack code may also add other
-            system messages (for example, for formatting tool definitions).
-        name:
-          type: string
-          description: >-
-            (Optional) The name of the system message participant.
-      additionalProperties: false
-      required:
-        - role
-        - content
-      title: OpenAISystemMessageParam
-      description: >-
-        A system message providing instructions or context to the model.
-    OpenAIToolMessageParam:
-      type: object
-      properties:
-        role:
-          type: string
-          const: tool
-          default: tool
-          description: >-
-            Must be "tool" to identify this as a tool response
-        tool_call_id:
-          type: string
-          description: >-
-            Unique identifier for the tool call this response is for
-        content:
-          oneOf:
-            - type: string
-            - type: array
-              items:
-                $ref: '#/components/schemas/OpenAIChatCompletionContentPartParam'
-          description: The response content from the tool
-      additionalProperties: false
-      required:
-        - role
-        - tool_call_id
-        - content
-      title: OpenAIToolMessageParam
-      description: >-
-        A message representing the result of a tool invocation in an OpenAI-compatible
-        chat completion request.
-    OpenAIUserMessageParam:
-      type: object
-      properties:
-        role:
-          type: string
-          const: user
-          default: user
-          description: >-
-            Must be "user" to identify this as a user message
-        content:
-          oneOf:
-            - type: string
-            - type: array
-              items:
-                $ref: '#/components/schemas/OpenAIChatCompletionContentPartParam'
-          description: >-
-            The content of the message, which can include text and other media
-        name:
-          type: string
-          description: >-
-            (Optional) The name of the user message participant.
-      additionalProperties: false
-      required:
-        - role
-        - content
-      title: OpenAIUserMessageParam
-      description: >-
-        A message from the user in an OpenAI-compatible chat completion request.
     OpenaiChatCompletionRequest:
       type: object
       properties:
@@ -7356,30 +7627,6 @@ components:
       title: OpenAIChatCompletionChunk
       description: >-
         Chunk from a streaming response to an OpenAI-compatible chat completion request.
-    OpenAIChoice:
-      type: object
-      properties:
-        message:
-          $ref: '#/components/schemas/OpenAIMessageParam'
-          description: The message from the model
-        finish_reason:
-          type: string
-          description: The reason the model stopped generating
-        index:
-          type: integer
-          description: The index of the choice
-        logprobs:
-          $ref: '#/components/schemas/OpenAIChoiceLogprobs'
-          description: >-
-            (Optional) The log probabilities for the tokens in the message
-      additionalProperties: false
-      required:
-        - message
-        - finish_reason
-        - index
-      title: OpenAIChoice
-      description: >-
-        A choice from an OpenAI-compatible chat completion response.
     OpenAIChoiceDelta:
       type: object
       properties:
@@ -7401,26 +7648,6 @@ components:
       title: OpenAIChoiceDelta
       description: >-
         A delta from an OpenAI-compatible chat completion streaming response.
-    OpenAIChoiceLogprobs:
-      type: object
-      properties:
-        content:
-          type: array
-          items:
-            $ref: '#/components/schemas/OpenAITokenLogProb'
-          description: >-
-            (Optional) The log probabilities for the tokens in the message
-        refusal:
-          type: array
-          items:
-            $ref: '#/components/schemas/OpenAITokenLogProb'
-          description: >-
-            (Optional) The log probabilities for the tokens in the message
-      additionalProperties: false
-      title: OpenAIChoiceLogprobs
-      description: >-
-        The log probabilities for the tokens in the message from an OpenAI-compatible
-        chat completion response.
     OpenAIChunkChoice:
       type: object
       properties:
@@ -7445,49 +7672,6 @@ components:
       title: OpenAIChunkChoice
       description: >-
         A chunk choice from an OpenAI-compatible chat completion streaming response.
-    OpenAITokenLogProb:
-      type: object
-      properties:
-        token:
-          type: string
-        bytes:
-          type: array
-          items:
-            type: integer
-        logprob:
-          type: number
-        top_logprobs:
-          type: array
-          items:
-            $ref: '#/components/schemas/OpenAITopLogProb'
-      additionalProperties: false
-      required:
-        - token
-        - logprob
-        - top_logprobs
-      title: OpenAITokenLogProb
-      description: >-
-        The log probability for a token from an OpenAI-compatible chat completion
-        response.
-    OpenAITopLogProb:
-      type: object
-      properties:
-        token:
-          type: string
-        bytes:
-          type: array
-          items:
-            type: integer
-        logprob:
-          type: number
-      additionalProperties: false
-      required:
-        - token
-        - logprob
-      title: OpenAITopLogProb
-      description: >-
-        The top log probability for a token from an OpenAI-compatible chat completion
-        response.
     OpenaiCompletionRequest:
       type: object
       properties:

@@ -759,7 +759,7 @@ class Generator:
         )

         return Operation(
-            tags=[op.defining_class.__name__],
+            tags=[getattr(op.defining_class, "API_NAMESPACE", op.defining_class.__name__)],
             summary=None,
             # summary=doc_string.short_description,
             description=description,
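The getattr fallback means an operation is grouped under a class's API_NAMESPACE when one is declared, and under the class name otherwise. A self-contained illustration (hypothetical classes, not the real endpoint classes):

class InferenceProvider:
    API_NAMESPACE = "Inference"

class Safety:
    pass  # no API_NAMESPACE attribute

for cls in (InferenceProvider, Safety):
    print(getattr(cls, "API_NAMESPACE", cls.__name__))
# -> Inference
# -> Safety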
@@ -805,6 +805,8 @@ class Generator:
         operation_tags: List[Tag] = []
         for cls in endpoint_classes:
             doc_string = parse_type(cls)
+            if hasattr(cls, "API_NAMESPACE") and cls.API_NAMESPACE != cls.__name__:
+                continue
             operation_tags.append(
                 Tag(
                     name=cls.__name__,

@@ -820,15 +820,32 @@ class BatchChatCompletionResponse(BaseModel):
     batch: list[ChatCompletionResponse]


+class OpenAICompletionWithInputMessages(OpenAIChatCompletion):
+    input_messages: list[OpenAIMessageParam]
+
+
+@json_schema_type
+class ListOpenAIChatCompletionResponse(BaseModel):
+    data: list[OpenAICompletionWithInputMessages]
+    has_more: bool
+    first_id: str
+    last_id: str
+    object: Literal["list"] = "list"
+
+
+class Order(Enum):
+    asc = "asc"
+    desc = "desc"
+
+
 @runtime_checkable
 @trace_protocol
-class Inference(Protocol):
-    """Llama Stack Inference API for generating completions, chat completions, and embeddings.
-
-    This API provides the raw interface to the underlying models. Two kinds of models are supported:
-    - LLM models: these models generate "raw" and "chat" (conversational) completions.
-    - Embedding models: these models generate embeddings to be used for semantic search.
+class InferenceProvider(Protocol):
+    """
+    This protocol defines the interface that should be implemented by all inference providers.
     """

+    API_NAMESPACE: str = "Inference"
+
     model_store: ModelStore | None = None
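A quick sketch of the new models in use (field values are invented; assumes this diff is applied so the names are importable):

from llama_stack.apis.inference import ListOpenAIChatCompletionResponse, Order

page = ListOpenAIChatCompletionResponse(
    data=[],
    has_more=False,
    first_id="chatcmpl-1",  # example IDs
    last_id="chatcmpl-1",
)
assert page.object == "list"        # Literal field defaults to "list"
assert Order("desc") is Order.desc  # enum round-trips from its string value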
@@ -1062,3 +1079,39 @@ class Inference(Protocol):
         :returns: An OpenAIChatCompletion.
         """
         ...
+
+
+class Inference(InferenceProvider):
+    """Llama Stack Inference API for generating completions, chat completions, and embeddings.
+
+    This API provides the raw interface to the underlying models. Two kinds of models are supported:
+    - LLM models: these models generate "raw" and "chat" (conversational) completions.
+    - Embedding models: these models generate embeddings to be used for semantic search.
+    """
+
+    @webmethod(route="/openai/v1/chat/completions", method="GET")
+    async def list_chat_completions(
+        self,
+        after: str | None = None,
+        limit: int | None = 20,
+        model: str | None = None,
+        order: Order | None = Order.desc,
+    ) -> ListOpenAIChatCompletionResponse:
+        """List all chat completions.
+
+        :param after: The ID of the last chat completion to return.
+        :param limit: The maximum number of chat completions to return.
+        :param model: The model to filter by.
+        :param order: The order to sort the chat completions by: "asc" or "desc". Defaults to "desc".
+        :returns: A ListOpenAIChatCompletionResponse.
+        """
+        raise NotImplementedError("List chat completions is not implemented")
+
+    @webmethod(route="/openai/v1/chat/completions/{completion_id}", method="GET")
+    async def get_chat_completion(self, completion_id: str) -> OpenAICompletionWithInputMessages:
+        """Describe a chat completion by its ID.
+
+        :param completion_id: ID of the chat completion.
+        :returns: A OpenAICompletionWithInputMessages.
+        """
+        raise NotImplementedError("Get chat completion is not implemented")

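The effect of the split: providers only need to satisfy InferenceProvider, while the two new GET endpoints live on Inference and raise NotImplementedError unless a concrete stack implementation overrides them. A minimal sketch of that default behavior (hypothetical subclass; assumes this diff is applied):

import asyncio

from llama_stack.apis.inference import Inference

class StubInference(Inference):
    """Hypothetical concrete subclass that overrides nothing."""

async def main() -> None:
    try:
        await StubInference().get_chat_completion("chatcmpl-123")  # example ID
    except NotImplementedError as err:
        print(err)  # -> Get chat completion is not implemented

asyncio.run(main())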
@@ -12,6 +12,7 @@ import shutil
 import sys
 import textwrap
 from functools import lru_cache
+from importlib.abc import Traversable
 from pathlib import Path

 import yaml
@@ -250,11 +251,10 @@ def run_stack_build_command(args: argparse.Namespace) -> None:
             sys.exit(1)

     if args.run:
-        run_config = Path(run_config)
         config_dict = yaml.safe_load(run_config.read_text())
         config = parse_and_maybe_upgrade_config(config_dict)
-        if not os.path.exists(str(config.external_providers_dir)):
-            os.makedirs(str(config.external_providers_dir), exist_ok=True)
+        if not os.path.exists(config.external_providers_dir):
+            os.makedirs(config.external_providers_dir, exist_ok=True)
         run_args = formulate_run_args(args.image_type, args.image_name, config, args.template)
         run_args.extend([str(os.getenv("LLAMA_STACK_PORT", 8321)), "--config", run_config])
         run_command(run_args)
@@ -264,7 +264,7 @@ def _generate_run_config(
     build_config: BuildConfig,
     build_dir: Path,
     image_name: str,
-) -> str:
+) -> Path:
     """
     Generate a run.yaml template file for user to edit from a build.yaml file
     """
@@ -343,7 +343,7 @@ def _run_stack_build_command_from_build_config(
     image_name: str | None = None,
     template_name: str | None = None,
     config_path: str | None = None,
-) -> str:
+) -> Path | Traversable:
     image_name = image_name or build_config.image_name
     if build_config.image_type == LlamaStackImageType.CONTAINER.value:
         if template_name:

@@ -340,8 +340,17 @@ class BuildConfig(BaseModel):
         default=None,
         description="Name of the distribution to build",
     )
-    external_providers_dir: str | None = Field(
+    external_providers_dir: Path | None = Field(
         default=None,
         description="Path to directory containing external provider implementations. The providers packages will be resolved from this directory. "
         "pip_packages MUST contain the provider package name.",
     )
+
+    @field_validator("external_providers_dir")
+    @classmethod
+    def validate_external_providers_dir(cls, v):
+        if v is None:
+            return None
+        if isinstance(v, str):
+            return Path(v)
+        return v

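A standalone reproduction of the coercion pattern (not the real BuildConfig, which has other required fields):

from pathlib import Path

from pydantic import BaseModel, Field, field_validator

class Cfg(BaseModel):
    external_providers_dir: Path | None = Field(default=None)

    @field_validator("external_providers_dir")
    @classmethod
    def _coerce(cls, v):
        if v is None:
            return None
        return Path(v) if isinstance(v, str) else v

print(Cfg(external_providers_dir="/tmp/providers").external_providers_dir)
# -> PosixPath('/tmp/providers') on POSIX systems

Note that pydantic v2 already coerces strings to Path for this annotation; the explicit validator just makes the behavior obvious at the definition site.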
@@ -226,6 +226,7 @@ class AsyncLlamaStackAsLibraryClient(AsyncLlamaStackClient):
                 distribution_spec=DistributionSpec(
                     providers=provider_types,
                 ),
+                external_providers_dir=self.config.external_providers_dir,
             )
             print_pip_install_help(build_config)
         else:

@@ -13,7 +13,7 @@ from llama_stack.apis.datasetio import DatasetIO
 from llama_stack.apis.datasets import Datasets
 from llama_stack.apis.eval import Eval
 from llama_stack.apis.files import Files
-from llama_stack.apis.inference import Inference
+from llama_stack.apis.inference import Inference, InferenceProvider
 from llama_stack.apis.inspect import Inspect
 from llama_stack.apis.models import Models
 from llama_stack.apis.post_training import PostTraining
@@ -83,6 +83,13 @@ def api_protocol_map() -> dict[Api, Any]:
     }


+def api_protocol_map_for_compliance_check() -> dict[Api, Any]:
+    return {
+        **api_protocol_map(),
+        Api.inference: InferenceProvider,
+    }
+
+
 def additional_protocols_map() -> dict[Api, Any]:
     return {
         Api.inference: (ModelsProtocolPrivate, Models, Api.models),
@@ -302,9 +309,6 @@ async def instantiate_provider(
     inner_impls: dict[str, Any],
     dist_registry: DistributionRegistry,
 ):
-    protocols = api_protocol_map()
-    additional_protocols = additional_protocols_map()
-
     provider_spec = provider.spec
     if not hasattr(provider_spec, "module"):
         raise AttributeError(f"ProviderSpec of type {type(provider_spec)} does not have a 'module' attribute")
@@ -342,6 +346,8 @@
     impl.__provider_spec__ = provider_spec
     impl.__provider_config__ = config

+    protocols = api_protocol_map_for_compliance_check()
+    additional_protocols = additional_protocols_map()
     # TODO: check compliance for special tool groups
     # the impl should be for Api.tool_runtime, the name should be the special tool group, the protocol should be the special tool group protocol
     check_protocol_compliance(impl, protocols[provider_spec.api])

@@ -280,7 +280,18 @@ class TracingMiddleware:
             logger.debug(f"No matching endpoint found for path: {path}, falling back to FastAPI")
             return await self.app(scope, receive, send)

-        trace_context = await start_trace(trace_path, {"__location__": "server", "raw_path": path})
+        trace_attributes = {"__location__": "server", "raw_path": path}
+
+        # Extract W3C trace context headers and store as trace attributes
+        headers = dict(scope.get("headers", []))
+        traceparent = headers.get(b"traceparent", b"").decode()
+        if traceparent:
+            trace_attributes["traceparent"] = traceparent
+        tracestate = headers.get(b"tracestate", b"").decode()
+        if tracestate:
+            trace_attributes["tracestate"] = tracestate
+
+        trace_context = await start_trace(trace_path, trace_attributes)

         async def send_with_trace_id(message):
             if message["type"] == "http.response.start":
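A client-side sketch of trace propagation (made-up header values following the W3C traceparent format version-traceid-spanid-flags; server address and model name assumed):

import requests

headers = {
    "traceparent": "00-4bf92f3577b34da6a3ce929d0e0e4736-00f067aa0ba902b7-01",
    "tracestate": "vendor=opaque-value",
}
requests.post(
    "http://localhost:8321/v1/openai/v1/chat/completions",  # assumed local server
    json={"model": "llama3.2:3b", "messages": [{"role": "user", "content": "hi"}]},
    headers=headers,
)

With these headers present, the middleware records them as trace attributes so the server-side trace can be stitched to the caller's trace.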
@@ -370,14 +381,6 @@ def main(args: argparse.Namespace | None = None):
     if args is None:
         args = parser.parse_args()

-    # Check for deprecated argument usage
-    if "--config" in sys.argv:
-        warnings.warn(
-            "The '--config' argument is deprecated and will be removed in a future version. Use '--config' instead.",
-            DeprecationWarning,
-            stacklevel=2,
-        )
-
     log_line = ""
     if args.config:
         # if the user provided a config file, use it, even if template was specified

@@ -28,7 +28,7 @@ from llama_stack.apis.inference import (
     CompletionRequest,
     CompletionResponse,
     CompletionResponseStreamChunk,
-    Inference,
+    InferenceProvider,
     InterleavedContent,
     LogProbConfig,
     Message,
@@ -86,7 +86,7 @@ class MetaReferenceInferenceImpl(
     OpenAICompletionToLlamaStackMixin,
     OpenAIChatCompletionToLlamaStackMixin,
     SentenceTransformerEmbeddingMixin,
-    Inference,
+    InferenceProvider,
     ModelsProtocolPrivate,
 ):
     def __init__(self, config: MetaReferenceInferenceConfig) -> None:

@@ -9,7 +9,7 @@ from collections.abc import AsyncGenerator

 from llama_stack.apis.inference import (
     CompletionResponse,
-    Inference,
+    InferenceProvider,
     InterleavedContent,
     LogProbConfig,
     Message,
@@ -38,7 +38,7 @@ class SentenceTransformersInferenceImpl(
     OpenAIChatCompletionToLlamaStackMixin,
     OpenAICompletionToLlamaStackMixin,
     SentenceTransformerEmbeddingMixin,
-    Inference,
+    InferenceProvider,
     ModelsProtocolPrivate,
 ):
     def __init__(self, config: SentenceTransformersInferenceConfig) -> None:

@@ -16,6 +16,7 @@ from opentelemetry.sdk.resources import Resource
 from opentelemetry.sdk.trace import TracerProvider
 from opentelemetry.sdk.trace.export import BatchSpanProcessor
 from opentelemetry.semconv.resource import ResourceAttributes
+from opentelemetry.trace.propagation.tracecontext import TraceContextTextMapPropagator

 from llama_stack.apis.telemetry import (
     Event,
@@ -44,6 +45,7 @@ from llama_stack.providers.inline.telemetry.meta_reference.sqlite_span_processor
 )
 from llama_stack.providers.utils.telemetry.dataset_mixin import TelemetryDatasetMixin
 from llama_stack.providers.utils.telemetry.sqlite_trace_store import SQLiteTraceStore
+from llama_stack.providers.utils.telemetry.tracing import ROOT_SPAN_MARKERS

 from .config import TelemetryConfig, TelemetrySink

@@ -206,6 +208,15 @@ class TelemetryAdapter(TelemetryDatasetMixin, Telemetry):
             event.attributes = {}
         event.attributes["__ttl__"] = ttl_seconds

+        # Extract these W3C trace context attributes so they are not written to
+        # underlying storage, as we just need them to propagate the trace context.
+        traceparent = event.attributes.pop("traceparent", None)
+        tracestate = event.attributes.pop("tracestate", None)
+        if traceparent:
+            # If we have a traceparent header value, we're not the root span.
+            for root_attribute in ROOT_SPAN_MARKERS:
+                event.attributes.pop(root_attribute, None)
+
         if isinstance(event.payload, SpanStartPayload):
             # Check if span already exists to prevent duplicates
             if span_id in _GLOBAL_STORAGE["active_spans"]:
@@ -216,8 +227,12 @@ class TelemetryAdapter(TelemetryDatasetMixin, Telemetry):
                 parent_span_id = int(event.payload.parent_span_id, 16)
                 parent_span = _GLOBAL_STORAGE["active_spans"].get(parent_span_id)
                 context = trace.set_span_in_context(parent_span)
-            else:
-                event.attributes["__root_span__"] = "true"
+            elif traceparent:
+                carrier = {
+                    "traceparent": traceparent,
+                    "tracestate": tracestate,
+                }
+                context = TraceContextTextMapPropagator().extract(carrier=carrier)

             span = tracer.start_span(
                 name=event.payload.name,

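For reference, the propagator round-trip in isolation (carrier values invented):

from opentelemetry.trace.propagation.tracecontext import TraceContextTextMapPropagator

carrier = {"traceparent": "00-4bf92f3577b34da6a3ce929d0e0e4736-00f067aa0ba902b7-01"}
ctx = TraceContextTextMapPropagator().extract(carrier=carrier)
# Spans started with context=ctx become children of the remote caller's span
# instead of starting a new root trace.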
@@ -4,12 +4,12 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.

-from llama_stack.apis.inference import Inference
+from llama_stack.apis.inference import InferenceProvider

 from .config import CerebrasCompatConfig


-async def get_adapter_impl(config: CerebrasCompatConfig, _deps) -> Inference:
+async def get_adapter_impl(config: CerebrasCompatConfig, _deps) -> InferenceProvider:
     # import dynamically so the import is used only when it is needed
     from .cerebras import CerebrasCompatInferenceAdapter

@@ -4,12 +4,12 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.

-from llama_stack.apis.inference import Inference
+from llama_stack.apis.inference import InferenceProvider

 from .config import FireworksCompatConfig


-async def get_adapter_impl(config: FireworksCompatConfig, _deps) -> Inference:
+async def get_adapter_impl(config: FireworksCompatConfig, _deps) -> InferenceProvider:
     # import dynamically so the import is used only when it is needed
     from .fireworks import FireworksCompatInferenceAdapter

@@ -4,12 +4,12 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.

-from llama_stack.apis.inference import Inference
+from llama_stack.apis.inference import InferenceProvider

 from .config import GroqCompatConfig


-async def get_adapter_impl(config: GroqCompatConfig, _deps) -> Inference:
+async def get_adapter_impl(config: GroqCompatConfig, _deps) -> InferenceProvider:
     # import dynamically so the import is used only when it is needed
     from .groq import GroqCompatInferenceAdapter

@@ -4,12 +4,12 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.

-from llama_stack.apis.inference import Inference
+from llama_stack.apis.inference import InferenceProvider

 from .config import LlamaCompatConfig


-async def get_adapter_impl(config: LlamaCompatConfig, _deps) -> Inference:
+async def get_adapter_impl(config: LlamaCompatConfig, _deps) -> InferenceProvider:
     # import dynamically so the import is used only when it is needed
     from .llama import LlamaCompatInferenceAdapter

@@ -28,7 +28,7 @@ from llama_stack.apis.inference import (
     EmbeddingsResponse,
     EmbeddingTaskType,
     GrammarResponseFormat,
-    Inference,
+    InferenceProvider,
     JsonSchemaResponseFormat,
     LogProbConfig,
     Message,
@@ -82,7 +82,7 @@ logger = get_logger(name=__name__, category="inference")


 class OllamaInferenceAdapter(
-    Inference,
+    InferenceProvider,
     ModelsProtocolPrivate,
 ):
     def __init__(self, url: str) -> None:

@@ -4,12 +4,12 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.

-from llama_stack.apis.inference import Inference
+from llama_stack.apis.inference import InferenceProvider

 from .config import SambaNovaCompatConfig


-async def get_adapter_impl(config: SambaNovaCompatConfig, _deps) -> Inference:
+async def get_adapter_impl(config: SambaNovaCompatConfig, _deps) -> InferenceProvider:
     # import dynamically so the import is used only when it is needed
     from .sambanova import SambaNovaCompatInferenceAdapter

@@ -4,12 +4,12 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.

-from llama_stack.apis.inference import Inference
+from llama_stack.apis.inference import InferenceProvider

 from .config import TogetherCompatConfig


-async def get_adapter_impl(config: TogetherCompatConfig, _deps) -> Inference:
+async def get_adapter_impl(config: TogetherCompatConfig, _deps) -> InferenceProvider:
     # import dynamically so the import is used only when it is needed
     from .together import TogetherCompatInferenceAdapter

@@ -19,7 +19,7 @@ from llama_stack.apis.inference import (
     ChatCompletionResponseStreamChunk,
     EmbeddingsResponse,
     EmbeddingTaskType,
-    Inference,
+    InferenceProvider,
     JsonSchemaResponseFormat,
     LogProbConfig,
     Message,
@@ -59,7 +59,7 @@ logger = get_logger(name=__name__, category="inference")

 class LiteLLMOpenAIMixin(
     ModelRegistryHelper,
-    Inference,
+    InferenceProvider,
     NeedsRequestProviderData,
 ):
     # TODO: avoid exposing the litellm specific model names to the user.

@@ -34,6 +34,8 @@ logger = get_logger(__name__, category="core")
 INVALID_SPAN_ID = 0x0000000000000000
 INVALID_TRACE_ID = 0x00000000000000000000000000000000

+ROOT_SPAN_MARKERS = ["__root__", "__root_span__"]
+

 def trace_id_to_str(trace_id: int) -> str:
     """Convenience trace ID formatting method
@@ -178,7 +180,8 @@ async def start_trace(name: str, attributes: dict[str, Any] = None) -> TraceContext:

     trace_id = generate_trace_id()
     context = TraceContext(BACKGROUND_LOGGER, trace_id)
-    context.push_span(name, {"__root__": True, **(attributes or {})})
+    attributes = {marker: True for marker in ROOT_SPAN_MARKERS} | (attributes or {})
+    context.push_span(name, attributes)

     CURRENT_TRACE_CONTEXT.set(context)
     return context
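One nuance of the rewritten push_span line: the dict union operator gives the right-hand operand precedence, so caller-supplied attributes can override the marker defaults. A tiny illustration:

ROOT_SPAN_MARKERS = ["__root__", "__root_span__"]

attributes = {marker: True for marker in ROOT_SPAN_MARKERS} | {"__root__": False}
print(attributes)
# -> {'__root__': False, '__root_span__': True}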