Mirror of https://github.com/meta-llama/llama-stack.git (synced 2025-10-03 19:57:35 +00:00)

chore(apis): unpublish deprecated /v1/inference apis (#3297)

# What does this PR do?
Unpublish (make unavailable to users) the following APIs:
- `/v1/inference/completion`, replaced by `/v1/openai/v1/completions`
- `/v1/inference/chat-completion`, replaced by `/v1/openai/v1/chat/completions`
- `/v1/inference/embeddings`, replaced by `/v1/openai/v1/embeddings`
- `/v1/inference/batch-completion`, replaced by `/v1/openai/v1/batches`
- `/v1/inference/batch-chat-completion`, replaced by `/v1/openai/v1/batches`

Note: the implementations are still available for internal use; for example, agents still use chat-completion.
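For illustration, a minimal migration sketch (assuming the OpenAI-compatible endpoints are served by a local Llama Stack instance; the base URL, API key, and model name below are placeholders, not part of this PR):

```python
# Hypothetical migration from the unpublished endpoint to its OpenAI-compatible replacement.
from openai import OpenAI

# Before (deprecated): POST /v1/inference/completion via the Llama Stack client, e.g.
#   llama_stack_client.inference.completion(model_id="my-model", content="Hello", stream=False)

# After: POST /v1/openai/v1/completions through any OpenAI-compatible client.
client = OpenAI(base_url="http://localhost:8321/v1/openai/v1", api_key="none")  # placeholder values
response = client.completions.create(model="my-model", prompt="Hello", stream=False)
print(response.choices[0].text)
```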
This commit is contained in: parent 60484c5c4e, commit 53b15725b6
23 changed files with 3134 additions and 1347 deletions
docs/static/llama-stack-spec.html (vendored): 169 changed lines
@@ -210,55 +210,6 @@
        }
      }
    },
    "/v1/inference/completion": {
      "post": {
        "responses": {
          "200": {
            "description": "If stream=False, returns a CompletionResponse with the full completion. If stream=True, returns an SSE event stream of CompletionResponseStreamChunk.",
            "content": {
              "application/json": {
                "schema": {
                  "$ref": "#/components/schemas/CompletionResponse"
                }
              },
              "text/event-stream": {
                "schema": {
                  "$ref": "#/components/schemas/CompletionResponseStreamChunk"
                }
              }
            }
          },
          "400": {
            "$ref": "#/components/responses/BadRequest400"
          },
          "429": {
            "$ref": "#/components/responses/TooManyRequests429"
          },
          "500": {
            "$ref": "#/components/responses/InternalServerError500"
          },
          "default": {
            "$ref": "#/components/responses/DefaultError"
          }
        },
        "tags": [
          "Inference"
        ],
        "summary": "Generate a completion for the given content using the specified model.",
        "description": "Generate a completion for the given content using the specified model.",
        "parameters": [],
        "requestBody": {
          "content": {
            "application/json": {
              "schema": {
                "$ref": "#/components/schemas/CompletionRequest"
              }
            }
          },
          "required": true
        }
      }
    },
    "/v1/agents": {
      "get": {
        "responses": {
@@ -7299,126 +7250,6 @@
        "title": "ToolCallDelta",
        "description": "A tool call content delta for streaming responses."
      },
      "CompletionRequest": {
        "type": "object",
        "properties": {
          "model_id": {
            "type": "string",
            "description": "The identifier of the model to use. The model must be registered with Llama Stack and available via the /models endpoint."
          },
          "content": {
            "$ref": "#/components/schemas/InterleavedContent",
            "description": "The content to generate a completion for."
          },
          "sampling_params": {
            "$ref": "#/components/schemas/SamplingParams",
            "description": "(Optional) Parameters to control the sampling strategy."
          },
          "response_format": {
            "$ref": "#/components/schemas/ResponseFormat",
            "description": "(Optional) Grammar specification for guided (structured) decoding."
          },
          "stream": {
            "type": "boolean",
            "description": "(Optional) If True, generate an SSE event stream of the response. Defaults to False."
          },
          "logprobs": {
            "type": "object",
            "properties": {
              "top_k": {
                "type": "integer",
                "default": 0,
                "description": "How many tokens (for each position) to return log probabilities for."
              }
            },
            "additionalProperties": false,
            "description": "(Optional) If specified, log probabilities for each token position will be returned."
          }
        },
        "additionalProperties": false,
        "required": [
          "model_id",
          "content"
        ],
        "title": "CompletionRequest"
      },
      "CompletionResponse": {
        "type": "object",
        "properties": {
          "metrics": {
            "type": "array",
            "items": {
              "$ref": "#/components/schemas/MetricInResponse"
            },
            "description": "(Optional) List of metrics associated with the API response"
          },
          "content": {
            "type": "string",
            "description": "The generated completion text"
          },
          "stop_reason": {
            "type": "string",
            "enum": [
              "end_of_turn",
              "end_of_message",
              "out_of_tokens"
            ],
            "description": "Reason why generation stopped"
          },
          "logprobs": {
            "type": "array",
            "items": {
              "$ref": "#/components/schemas/TokenLogProbs"
            },
            "description": "Optional log probabilities for generated tokens"
          }
        },
        "additionalProperties": false,
        "required": [
          "content",
          "stop_reason"
        ],
        "title": "CompletionResponse",
        "description": "Response from a completion request."
      },
      "CompletionResponseStreamChunk": {
        "type": "object",
        "properties": {
          "metrics": {
            "type": "array",
            "items": {
              "$ref": "#/components/schemas/MetricInResponse"
            },
            "description": "(Optional) List of metrics associated with the API response"
          },
          "delta": {
            "type": "string",
            "description": "New content generated since last chunk. This can be one or more tokens."
          },
          "stop_reason": {
            "type": "string",
            "enum": [
              "end_of_turn",
              "end_of_message",
              "out_of_tokens"
            ],
            "description": "Optional reason why generation stopped, if complete"
          },
          "logprobs": {
            "type": "array",
            "items": {
              "$ref": "#/components/schemas/TokenLogProbs"
            },
            "description": "Optional log probabilities for generated tokens"
          }
        },
        "additionalProperties": false,
        "required": [
          "delta"
        ],
        "title": "CompletionResponseStreamChunk",
        "description": "A chunk of a streamed completion response."
      },
      "AgentConfig": {
        "type": "object",
        "properties": {
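The removed spec entries above document the old streaming contract (a full `CompletionResponse` when `stream=False`, an SSE stream of `CompletionResponseStreamChunk` when `stream=True`). The same behavior is available on the replacement route; a short sketch using an OpenAI-compatible client (base URL, API key, and model name are placeholders):

```python
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8321/v1/openai/v1", api_key="none")  # placeholders
stream = client.completions.create(model="my-model", prompt="Hello", stream=True, max_tokens=16)
for chunk in stream:
    # Each streamed chunk carries an incremental piece of generated text.
    print(chunk.choices[0].text, end="")
print()
```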
docs/static/llama-stack-spec.yaml (vendored): 143 changed lines
@@ -132,43 +132,6 @@ paths:
          schema:
            $ref: '#/components/schemas/ChatCompletionRequest'
      required: true
  /v1/inference/completion:
    post:
      responses:
        '200':
          description: >-
            If stream=False, returns a CompletionResponse with the full completion.
            If stream=True, returns an SSE event stream of CompletionResponseStreamChunk.
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/CompletionResponse'
            text/event-stream:
              schema:
                $ref: '#/components/schemas/CompletionResponseStreamChunk'
        '400':
          $ref: '#/components/responses/BadRequest400'
        '429':
          $ref: >-
            #/components/responses/TooManyRequests429
        '500':
          $ref: >-
            #/components/responses/InternalServerError500
        default:
          $ref: '#/components/responses/DefaultError'
      tags:
        - Inference
      summary: >-
        Generate a completion for the given content using the specified model.
      description: >-
        Generate a completion for the given content using the specified model.
      parameters: []
      requestBody:
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/CompletionRequest'
        required: true
  /v1/agents:
    get:
      responses:
@@ -5292,112 +5255,6 @@ components:
      title: ToolCallDelta
      description: >-
        A tool call content delta for streaming responses.
    CompletionRequest:
      type: object
      properties:
        model_id:
          type: string
          description: >-
            The identifier of the model to use. The model must be registered with
            Llama Stack and available via the /models endpoint.
        content:
          $ref: '#/components/schemas/InterleavedContent'
          description: >-
            The content to generate a completion for.
        sampling_params:
          $ref: '#/components/schemas/SamplingParams'
          description: >-
            (Optional) Parameters to control the sampling strategy.
        response_format:
          $ref: '#/components/schemas/ResponseFormat'
          description: >-
            (Optional) Grammar specification for guided (structured) decoding.
        stream:
          type: boolean
          description: >-
            (Optional) If True, generate an SSE event stream of the response. Defaults
            to False.
        logprobs:
          type: object
          properties:
            top_k:
              type: integer
              default: 0
              description: >-
                How many tokens (for each position) to return log probabilities for.
          additionalProperties: false
          description: >-
            (Optional) If specified, log probabilities for each token position will
            be returned.
      additionalProperties: false
      required:
        - model_id
        - content
      title: CompletionRequest
    CompletionResponse:
      type: object
      properties:
        metrics:
          type: array
          items:
            $ref: '#/components/schemas/MetricInResponse'
          description: >-
            (Optional) List of metrics associated with the API response
        content:
          type: string
          description: The generated completion text
        stop_reason:
          type: string
          enum:
            - end_of_turn
            - end_of_message
            - out_of_tokens
          description: Reason why generation stopped
        logprobs:
          type: array
          items:
            $ref: '#/components/schemas/TokenLogProbs'
          description: >-
            Optional log probabilities for generated tokens
      additionalProperties: false
      required:
        - content
        - stop_reason
      title: CompletionResponse
      description: Response from a completion request.
    CompletionResponseStreamChunk:
      type: object
      properties:
        metrics:
          type: array
          items:
            $ref: '#/components/schemas/MetricInResponse'
          description: >-
            (Optional) List of metrics associated with the API response
        delta:
          type: string
          description: >-
            New content generated since last chunk. This can be one or more tokens.
        stop_reason:
          type: string
          enum:
            - end_of_turn
            - end_of_message
            - out_of_tokens
          description: >-
            Optional reason why generation stopped, if complete
        logprobs:
          type: array
          items:
            $ref: '#/components/schemas/TokenLogProbs'
          description: >-
            Optional log probabilities for generated tokens
      additionalProperties: false
      required:
        - delta
      title: CompletionResponseStreamChunk
      description: >-
        A chunk of a streamed completion response.
    AgentConfig:
      type: object
      properties:
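For reference, a request body matching the removed `CompletionRequest` schema looked roughly like the following (values are placeholders; field names come from the schema above):

```python
# Shape of a (now-unpublished) POST /v1/inference/completion body, per the removed schema.
completion_request = {
    "model_id": "my-model",                     # must be registered and visible via /models
    "content": "Write a haiku about the sea.",  # InterleavedContent; a plain string works
    "sampling_params": {"max_tokens": 50},      # optional sampling controls
    "stream": False,                            # True would return SSE chunks instead
    "logprobs": {"top_k": 1},                   # optional per-token log probabilities
}
```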
@@ -1008,7 +1008,6 @@ class InferenceProvider(Protocol):

    model_store: ModelStore | None = None

    @webmethod(route="/inference/completion", method="POST", level=LLAMA_STACK_API_V1)
    async def completion(
        self,
        model_id: str,
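For readers unfamiliar with the routing decorator shown above, here is a generic illustration of the pattern; this is not the actual `llama_stack` implementation, only a sketch of how such a decorator can attach route metadata to a protocol method:

```python
# Illustrative only: a decorator that records route metadata on the wrapped function.
def webmethod(route: str, method: str = "POST", level: str | None = None):
    def wrap(fn):
        fn.__webmethod__ = {"route": route, "method": method, "level": level}  # hypothetical attribute
        return fn
    return wrap
```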
@@ -192,6 +192,14 @@ async def localize_image_content(uri: str) -> tuple[bytes, str] | None:
            format = "png"

        return content, format
    elif uri.startswith("data"):
        # data:image/{format};base64,{data}
        match = re.match(r"data:image/(\w+);base64,(.+)", uri)
        if not match:
            raise ValueError(f"Invalid data URL format, {uri[:40]}...")
        fmt, image_data = match.groups()
        content = base64.b64decode(image_data)
        return content, fmt
    else:
        return None
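As a standalone illustration of the data-URL branch added above (the sample bytes are made up; only the regex and decoding mirror the diff):

```python
import base64
import re

# Build a data URL, then parse it the same way localize_image_content does.
uri = "data:image/png;base64," + base64.b64encode(b"\x89PNG fake payload").decode()
match = re.match(r"data:image/(\w+);base64,(.+)", uri)
assert match is not None
fmt, image_data = match.groups()
content = base64.b64decode(image_data)
print(fmt, len(content))  # -> png 17
```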
|
@ -1,303 +0,0 @@
|
|||
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
||||
# All rights reserved.
|
||||
#
|
||||
# This source code is licensed under the terms described in the LICENSE file in
|
||||
# the root directory of this source tree.
|
||||
|
||||
|
||||
#
|
||||
# Test plan:
|
||||
#
|
||||
# Types of input:
|
||||
# - array of a string
|
||||
# - array of a image (ImageContentItem, either URL or base64 string)
|
||||
# - array of a text (TextContentItem)
|
||||
# Types of output:
|
||||
# - list of list of floats
|
||||
# Params:
|
||||
# - text_truncation
|
||||
# - absent w/ long text -> error
|
||||
# - none w/ long text -> error
|
||||
# - absent w/ short text -> ok
|
||||
# - none w/ short text -> ok
|
||||
# - end w/ long text -> ok
|
||||
# - end w/ short text -> ok
|
||||
# - start w/ long text -> ok
|
||||
# - start w/ short text -> ok
|
||||
# - output_dimension
|
||||
# - response dimension matches
|
||||
# - task_type, only for asymmetric models
|
||||
# - query embedding != passage embedding
|
||||
# Negative:
|
||||
# - long string
|
||||
# - long text
|
||||
#
|
||||
# Todo:
|
||||
# - negative tests
|
||||
# - empty
|
||||
# - empty list
|
||||
# - empty string
|
||||
# - empty text
|
||||
# - empty image
|
||||
# - long
|
||||
# - large image
|
||||
# - appropriate combinations
|
||||
# - batch size
|
||||
# - many inputs
|
||||
# - invalid
|
||||
# - invalid URL
|
||||
# - invalid base64
|
||||
#
|
||||
# Notes:
|
||||
# - use llama_stack_client fixture
|
||||
# - use pytest.mark.parametrize when possible
|
||||
# - no accuracy tests: only check the type of output, not the content
|
||||
#
|
||||
|
||||
import pytest
|
||||
from llama_stack_client import BadRequestError as LlamaStackBadRequestError
|
||||
from llama_stack_client.types import EmbeddingsResponse
|
||||
from llama_stack_client.types.shared.interleaved_content import (
|
||||
ImageContentItem,
|
||||
ImageContentItemImage,
|
||||
ImageContentItemImageURL,
|
||||
TextContentItem,
|
||||
)
|
||||
from openai import BadRequestError as OpenAIBadRequestError
|
||||
|
||||
from llama_stack.core.library_client import LlamaStackAsLibraryClient
|
||||
|
||||
DUMMY_STRING = "hello"
|
||||
DUMMY_STRING2 = "world"
|
||||
DUMMY_LONG_STRING = "NVDA " * 10240
|
||||
DUMMY_TEXT = TextContentItem(text=DUMMY_STRING, type="text")
|
||||
DUMMY_TEXT2 = TextContentItem(text=DUMMY_STRING2, type="text")
|
||||
DUMMY_LONG_TEXT = TextContentItem(text=DUMMY_LONG_STRING, type="text")
|
||||
# TODO(mf): add a real image URL and base64 string
|
||||
DUMMY_IMAGE_URL = ImageContentItem(
|
||||
image=ImageContentItemImage(url=ImageContentItemImageURL(uri="https://example.com/image.jpg")), type="image"
|
||||
)
|
||||
DUMMY_IMAGE_BASE64 = ImageContentItem(image=ImageContentItemImage(data="base64string"), type="image")
|
||||
SUPPORTED_PROVIDERS = {"remote::nvidia"}
|
||||
MODELS_SUPPORTING_MEDIA = {}
|
||||
MODELS_SUPPORTING_OUTPUT_DIMENSION = {"nvidia/llama-3.2-nv-embedqa-1b-v2"}
|
||||
MODELS_REQUIRING_TASK_TYPE = {
|
||||
"nvidia/llama-3.2-nv-embedqa-1b-v2",
|
||||
"nvidia/nv-embedqa-e5-v5",
|
||||
"nvidia/nv-embedqa-mistral-7b-v2",
|
||||
"snowflake/arctic-embed-l",
|
||||
}
|
||||
MODELS_SUPPORTING_TASK_TYPE = MODELS_REQUIRING_TASK_TYPE
|
||||
|
||||
|
||||
def default_task_type(model_id):
|
||||
"""
|
||||
Some models require a task type parameter. This provides a default value for
|
||||
testing those models.
|
||||
"""
|
||||
if model_id in MODELS_REQUIRING_TASK_TYPE:
|
||||
return {"task_type": "query"}
|
||||
return {}
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"contents",
|
||||
[
|
||||
[DUMMY_STRING, DUMMY_STRING2],
|
||||
[DUMMY_TEXT, DUMMY_TEXT2],
|
||||
],
|
||||
ids=[
|
||||
"list[string]",
|
||||
"list[text]",
|
||||
],
|
||||
)
|
||||
def test_embedding_text(llama_stack_client, embedding_model_id, contents, inference_provider_type):
|
||||
if inference_provider_type not in SUPPORTED_PROVIDERS:
|
||||
pytest.xfail(f"{inference_provider_type} doesn't support embedding model yet")
|
||||
response = llama_stack_client.inference.embeddings(
|
||||
model_id=embedding_model_id, contents=contents, **default_task_type(embedding_model_id)
|
||||
)
|
||||
assert isinstance(response, EmbeddingsResponse)
|
||||
assert len(response.embeddings) == sum(len(content) if isinstance(content, list) else 1 for content in contents)
|
||||
assert isinstance(response.embeddings[0], list)
|
||||
assert isinstance(response.embeddings[0][0], float)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"contents",
|
||||
[
|
||||
[DUMMY_IMAGE_URL, DUMMY_IMAGE_BASE64],
|
||||
[DUMMY_IMAGE_URL, DUMMY_STRING, DUMMY_IMAGE_BASE64, DUMMY_TEXT],
|
||||
],
|
||||
ids=[
|
||||
"list[url,base64]",
|
||||
"list[url,string,base64,text]",
|
||||
],
|
||||
)
|
||||
def test_embedding_image(llama_stack_client, embedding_model_id, contents, inference_provider_type):
|
||||
if inference_provider_type not in SUPPORTED_PROVIDERS:
|
||||
pytest.xfail(f"{inference_provider_type} doesn't support embedding model yet")
|
||||
if embedding_model_id not in MODELS_SUPPORTING_MEDIA:
|
||||
pytest.xfail(f"{embedding_model_id} doesn't support media")
|
||||
response = llama_stack_client.inference.embeddings(
|
||||
model_id=embedding_model_id, contents=contents, **default_task_type(embedding_model_id)
|
||||
)
|
||||
assert isinstance(response, EmbeddingsResponse)
|
||||
assert len(response.embeddings) == sum(len(content) if isinstance(content, list) else 1 for content in contents)
|
||||
assert isinstance(response.embeddings[0], list)
|
||||
assert isinstance(response.embeddings[0][0], float)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"text_truncation",
|
||||
[
|
||||
"end",
|
||||
"start",
|
||||
],
|
||||
)
|
||||
@pytest.mark.parametrize(
|
||||
"contents",
|
||||
[
|
||||
[DUMMY_LONG_TEXT],
|
||||
[DUMMY_STRING],
|
||||
],
|
||||
ids=[
|
||||
"long",
|
||||
"short",
|
||||
],
|
||||
)
|
||||
def test_embedding_truncation(
|
||||
llama_stack_client, embedding_model_id, text_truncation, contents, inference_provider_type
|
||||
):
|
||||
if inference_provider_type not in SUPPORTED_PROVIDERS:
|
||||
pytest.xfail(f"{inference_provider_type} doesn't support embedding model yet")
|
||||
response = llama_stack_client.inference.embeddings(
|
||||
model_id=embedding_model_id,
|
||||
contents=contents,
|
||||
text_truncation=text_truncation,
|
||||
**default_task_type(embedding_model_id),
|
||||
)
|
||||
assert isinstance(response, EmbeddingsResponse)
|
||||
assert len(response.embeddings) == 1
|
||||
assert isinstance(response.embeddings[0], list)
|
||||
assert isinstance(response.embeddings[0][0], float)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"text_truncation",
|
||||
[
|
||||
None,
|
||||
"none",
|
||||
],
|
||||
)
|
||||
@pytest.mark.parametrize(
|
||||
"contents",
|
||||
[
|
||||
[DUMMY_LONG_TEXT],
|
||||
[DUMMY_LONG_STRING],
|
||||
],
|
||||
ids=[
|
||||
"long-text",
|
||||
"long-str",
|
||||
],
|
||||
)
|
||||
def test_embedding_truncation_error(
|
||||
llama_stack_client, embedding_model_id, text_truncation, contents, inference_provider_type
|
||||
):
|
||||
if inference_provider_type not in SUPPORTED_PROVIDERS:
|
||||
pytest.xfail(f"{inference_provider_type} doesn't support embedding model yet")
|
||||
# Using LlamaStackClient from llama_stack_client will raise llama_stack_client.BadRequestError
|
||||
# While using LlamaStackAsLibraryClient from llama_stack.distribution.library_client will raise the error that the backend raises
|
||||
error_type = (
|
||||
OpenAIBadRequestError
|
||||
if isinstance(llama_stack_client, LlamaStackAsLibraryClient)
|
||||
else LlamaStackBadRequestError
|
||||
)
|
||||
with pytest.raises(error_type):
|
||||
llama_stack_client.inference.embeddings(
|
||||
model_id=embedding_model_id,
|
||||
contents=[DUMMY_LONG_TEXT],
|
||||
text_truncation=text_truncation,
|
||||
**default_task_type(embedding_model_id),
|
||||
)
|
||||
|
||||
|
||||
def test_embedding_output_dimension(llama_stack_client, embedding_model_id, inference_provider_type):
|
||||
if inference_provider_type not in SUPPORTED_PROVIDERS:
|
||||
pytest.xfail(f"{inference_provider_type} doesn't support embedding model yet")
|
||||
if embedding_model_id not in MODELS_SUPPORTING_OUTPUT_DIMENSION:
|
||||
pytest.xfail(f"{embedding_model_id} doesn't support output_dimension")
|
||||
base_response = llama_stack_client.inference.embeddings(
|
||||
model_id=embedding_model_id, contents=[DUMMY_STRING], **default_task_type(embedding_model_id)
|
||||
)
|
||||
test_response = llama_stack_client.inference.embeddings(
|
||||
model_id=embedding_model_id,
|
||||
contents=[DUMMY_STRING],
|
||||
**default_task_type(embedding_model_id),
|
||||
output_dimension=32,
|
||||
)
|
||||
assert len(base_response.embeddings[0]) != len(test_response.embeddings[0])
|
||||
assert len(test_response.embeddings[0]) == 32
|
||||
|
||||
|
||||
def test_embedding_task_type(llama_stack_client, embedding_model_id, inference_provider_type):
|
||||
if inference_provider_type not in SUPPORTED_PROVIDERS:
|
||||
pytest.xfail(f"{inference_provider_type} doesn't support embedding model yet")
|
||||
if embedding_model_id not in MODELS_SUPPORTING_TASK_TYPE:
|
||||
pytest.xfail(f"{embedding_model_id} doesn't support task_type")
|
||||
query_embedding = llama_stack_client.inference.embeddings(
|
||||
model_id=embedding_model_id, contents=[DUMMY_STRING], task_type="query"
|
||||
)
|
||||
document_embedding = llama_stack_client.inference.embeddings(
|
||||
model_id=embedding_model_id, contents=[DUMMY_STRING], task_type="document"
|
||||
)
|
||||
assert query_embedding.embeddings != document_embedding.embeddings
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"text_truncation",
|
||||
[
|
||||
None,
|
||||
"none",
|
||||
"end",
|
||||
"start",
|
||||
],
|
||||
)
|
||||
def test_embedding_text_truncation(llama_stack_client, embedding_model_id, text_truncation, inference_provider_type):
|
||||
if inference_provider_type not in SUPPORTED_PROVIDERS:
|
||||
pytest.xfail(f"{inference_provider_type} doesn't support embedding model yet")
|
||||
response = llama_stack_client.inference.embeddings(
|
||||
model_id=embedding_model_id,
|
||||
contents=[DUMMY_STRING],
|
||||
text_truncation=text_truncation,
|
||||
**default_task_type(embedding_model_id),
|
||||
)
|
||||
assert isinstance(response, EmbeddingsResponse)
|
||||
assert len(response.embeddings) == 1
|
||||
assert isinstance(response.embeddings[0], list)
|
||||
assert isinstance(response.embeddings[0][0], float)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"text_truncation",
|
||||
[
|
||||
"NONE",
|
||||
"END",
|
||||
"START",
|
||||
"left",
|
||||
"right",
|
||||
],
|
||||
)
|
||||
def test_embedding_text_truncation_error(
|
||||
llama_stack_client, embedding_model_id, text_truncation, inference_provider_type
|
||||
):
|
||||
if inference_provider_type not in SUPPORTED_PROVIDERS:
|
||||
pytest.xfail(f"{inference_provider_type} doesn't support embedding model yet")
|
||||
error_type = ValueError if isinstance(llama_stack_client, LlamaStackAsLibraryClient) else LlamaStackBadRequestError
|
||||
with pytest.raises(error_type):
|
||||
llama_stack_client.inference.embeddings(
|
||||
model_id=embedding_model_id,
|
||||
contents=[DUMMY_STRING],
|
||||
text_truncation=text_truncation,
|
||||
**default_task_type(embedding_model_id),
|
||||
)
|
|
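The file deleted above tested the deprecated `inference.embeddings` API. A minimal sketch of its OpenAI-compatible replacement (`/v1/openai/v1/embeddings`); the client setup and model name are placeholders:

```python
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8321/v1/openai/v1", api_key="none")  # placeholders
resp = client.embeddings.create(model="my-embedding-model", input=["hello", "world"])
print(len(resp.data), len(resp.data[0].embedding))  # 2 embeddings, each a list of floats
```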
@ -9,6 +9,7 @@ import time
|
|||
import unicodedata
|
||||
|
||||
import pytest
|
||||
from pydantic import BaseModel
|
||||
|
||||
from ..test_cases.test_case import TestCase
|
||||
|
||||
|
@ -62,6 +63,14 @@ def skip_if_model_doesnt_support_openai_completion(client_with_models, model_id)
|
|||
pytest.skip(f"Model {model_id} hosted by {provider.provider_type} doesn't support OpenAI completions.")
|
||||
|
||||
|
||||
def skip_if_doesnt_support_completions_logprobs(client_with_models, model_id):
|
||||
provider_type = provider_from_model(client_with_models, model_id).provider_type
|
||||
if provider_type in (
|
||||
"remote::ollama", # logprobs is ignored
|
||||
):
|
||||
pytest.skip(f"Model {model_id} hosted by {provider_type} doesn't support /v1/completions logprobs.")
|
||||
|
||||
|
||||
def skip_if_model_doesnt_support_suffix(client_with_models, model_id):
|
||||
# To test `fim` (fill-in-the-middle) completion, we need to use a model that supports suffix.
|
||||
# Use this to specifically test this API functionality.
|
||||
|
@ -205,28 +214,6 @@ def test_openai_completion_streaming(llama_stack_client, client_with_models, tex
|
|||
assert len(content_str) > 10
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"prompt_logprobs",
|
||||
[
|
||||
1,
|
||||
0,
|
||||
],
|
||||
)
|
||||
def test_openai_completion_prompt_logprobs(llama_stack_client, client_with_models, text_model_id, prompt_logprobs):
|
||||
skip_if_provider_isnt_vllm(client_with_models, text_model_id)
|
||||
|
||||
prompt = "Hello, world!"
|
||||
response = llama_stack_client.completions.create(
|
||||
model=text_model_id,
|
||||
prompt=prompt,
|
||||
stream=False,
|
||||
prompt_logprobs=prompt_logprobs,
|
||||
)
|
||||
assert len(response.choices) > 0
|
||||
choice = response.choices[0]
|
||||
assert len(choice.prompt_logprobs) > 0
|
||||
|
||||
|
||||
def test_openai_completion_guided_choice(llama_stack_client, client_with_models, text_model_id):
|
||||
skip_if_provider_isnt_vllm(client_with_models, text_model_id)
|
||||
|
||||
|
@ -518,3 +505,214 @@ def test_openai_chat_completion_non_streaming_with_file(openai_client, client_wi
|
|||
message_content = response.choices[0].message.content.lower().strip()
|
||||
normalized_content = _normalize_text(message_content)
|
||||
assert "hello world" in normalized_content
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"test_case",
|
||||
[
|
||||
"inference:completion:stop_sequence",
|
||||
],
|
||||
)
|
||||
def test_openai_completion_stop_sequence(client_with_models, openai_client, text_model_id, test_case):
|
||||
skip_if_model_doesnt_support_openai_completion(client_with_models, text_model_id)
|
||||
|
||||
tc = TestCase(test_case)
|
||||
|
||||
response = openai_client.completions.create(
|
||||
model=text_model_id,
|
||||
prompt=tc["content"],
|
||||
stop="1963",
|
||||
stream=False,
|
||||
)
|
||||
assert len(response.choices) > 0
|
||||
choice = response.choices[0]
|
||||
assert "1963" not in choice.text
|
||||
|
||||
response = openai_client.completions.create(
|
||||
model=text_model_id,
|
||||
prompt=tc["content"],
|
||||
stop=["blathering", "1963"],
|
||||
stream=False,
|
||||
)
|
||||
assert len(response.choices) > 0
|
||||
choice = response.choices[0]
|
||||
assert "1963" not in choice.text
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"test_case",
|
||||
[
|
||||
"inference:completion:log_probs",
|
||||
],
|
||||
)
|
||||
def test_openai_completion_logprobs(client_with_models, openai_client, text_model_id, test_case):
|
||||
skip_if_model_doesnt_support_openai_completion(client_with_models, text_model_id)
|
||||
skip_if_doesnt_support_completions_logprobs(client_with_models, text_model_id)
|
||||
|
||||
tc = TestCase(test_case)
|
||||
|
||||
response = openai_client.completions.create(
|
||||
model=text_model_id,
|
||||
prompt=tc["content"],
|
||||
logprobs=5,
|
||||
)
|
||||
assert len(response.choices) > 0
|
||||
choice = response.choices[0]
|
||||
assert choice.text, "Response text should not be empty"
|
||||
assert choice.logprobs, "Logprobs should not be empty"
|
||||
logprobs = choice.logprobs
|
||||
assert logprobs.token_logprobs, "Response tokens should not be empty"
|
||||
assert len(logprobs.tokens) == len(logprobs.token_logprobs)
|
||||
assert len(logprobs.token_logprobs) == len(logprobs.top_logprobs)
|
||||
for i, (token, prob) in enumerate(zip(logprobs.tokens, logprobs.token_logprobs, strict=True)):
|
||||
assert logprobs.top_logprobs[i][token] == prob
|
||||
assert len(logprobs.top_logprobs[i]) == 5
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"test_case",
|
||||
[
|
||||
"inference:completion:log_probs",
|
||||
],
|
||||
)
|
||||
def test_openai_completion_logprobs_streaming(client_with_models, openai_client, text_model_id, test_case):
|
||||
skip_if_model_doesnt_support_openai_completion(client_with_models, text_model_id)
|
||||
skip_if_doesnt_support_completions_logprobs(client_with_models, text_model_id)
|
||||
|
||||
tc = TestCase(test_case)
|
||||
|
||||
response = openai_client.completions.create(
|
||||
model=text_model_id,
|
||||
prompt=tc["content"],
|
||||
logprobs=3,
|
||||
stream=True,
|
||||
max_tokens=5,
|
||||
)
|
||||
for chunk in response:
|
||||
choice = chunk.choices[0]
|
||||
|
||||
if choice.text: # if there's a token, we expect logprobs
|
||||
assert choice.logprobs, "Logprobs should not be empty"
|
||||
logprobs = choice.logprobs
|
||||
assert logprobs.token_logprobs, "Response tokens should not be empty"
|
||||
assert len(logprobs.tokens) == len(logprobs.token_logprobs)
|
||||
assert len(logprobs.token_logprobs) == len(logprobs.top_logprobs)
|
||||
for i, (token, prob) in enumerate(zip(logprobs.tokens, logprobs.token_logprobs, strict=True)):
|
||||
assert logprobs.top_logprobs[i][token] == prob
|
||||
assert len(logprobs.top_logprobs[i]) == 3
|
||||
else: # no token, no logprobs
|
||||
assert not choice.logprobs, "Logprobs should be empty"
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"test_case",
|
||||
[
|
||||
"inference:chat_completion:tool_calling",
|
||||
],
|
||||
)
|
||||
def test_openai_chat_completion_with_tools(openai_client, text_model_id, test_case):
|
||||
tc = TestCase(test_case)
|
||||
|
||||
response = openai_client.chat.completions.create(
|
||||
model=text_model_id,
|
||||
messages=tc["messages"],
|
||||
tools=tc["tools"],
|
||||
tool_choice="auto",
|
||||
stream=False,
|
||||
)
|
||||
assert len(response.choices) == 1
|
||||
assert len(response.choices[0].message.tool_calls) == 1
|
||||
tool_call = response.choices[0].message.tool_calls[0]
|
||||
assert tool_call.function.name == tc["tools"][0]["function"]["name"]
|
||||
assert "location" in tool_call.function.arguments
|
||||
assert tc["expected"]["location"] in tool_call.function.arguments
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"test_case",
|
||||
[
|
||||
"inference:chat_completion:tool_calling",
|
||||
],
|
||||
)
|
||||
def test_openai_chat_completion_with_tools_and_streaming(openai_client, text_model_id, test_case):
|
||||
tc = TestCase(test_case)
|
||||
|
||||
response = openai_client.chat.completions.create(
|
||||
model=text_model_id,
|
||||
messages=tc["messages"],
|
||||
tools=tc["tools"],
|
||||
tool_choice="auto",
|
||||
stream=True,
|
||||
)
|
||||
# Accumulate tool calls from streaming chunks
|
||||
tool_calls = []
|
||||
for chunk in response:
|
||||
if chunk.choices and chunk.choices[0].delta.tool_calls:
|
||||
for i, tc_delta in enumerate(chunk.choices[0].delta.tool_calls):
|
||||
while len(tool_calls) <= i:
|
||||
tool_calls.append({"function": {"name": "", "arguments": ""}})
|
||||
if tc_delta.function and tc_delta.function.name:
|
||||
tool_calls[i]["function"]["name"] = tc_delta.function.name
|
||||
if tc_delta.function and tc_delta.function.arguments:
|
||||
tool_calls[i]["function"]["arguments"] += tc_delta.function.arguments
|
||||
assert len(tool_calls) == 1
|
||||
tool_call = tool_calls[0]
|
||||
assert tool_call["function"]["name"] == tc["tools"][0]["function"]["name"]
|
||||
assert "location" in tool_call["function"]["arguments"]
|
||||
assert tc["expected"]["location"] in tool_call["function"]["arguments"]
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"test_case",
|
||||
[
|
||||
"inference:chat_completion:tool_calling",
|
||||
],
|
||||
)
|
||||
def test_openai_chat_completion_with_tool_choice_none(openai_client, text_model_id, test_case):
|
||||
tc = TestCase(test_case)
|
||||
|
||||
response = openai_client.chat.completions.create(
|
||||
model=text_model_id,
|
||||
messages=tc["messages"],
|
||||
tools=tc["tools"],
|
||||
tool_choice="none",
|
||||
stream=False,
|
||||
)
|
||||
assert len(response.choices) == 1
|
||||
tool_calls = response.choices[0].message.tool_calls
|
||||
assert tool_calls is None or len(tool_calls) == 0
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"test_case",
|
||||
[
|
||||
"inference:chat_completion:structured_output",
|
||||
],
|
||||
)
|
||||
def test_openai_chat_completion_structured_output(openai_client, text_model_id, test_case):
|
||||
# Note: Skip condition may need adjustment for OpenAI client
|
||||
class AnswerFormat(BaseModel):
|
||||
first_name: str
|
||||
last_name: str
|
||||
year_of_birth: int
|
||||
|
||||
tc = TestCase(test_case)
|
||||
|
||||
response = openai_client.chat.completions.create(
|
||||
model=text_model_id,
|
||||
messages=tc["messages"],
|
||||
response_format={
|
||||
"type": "json_schema",
|
||||
"json_schema": {
|
||||
"name": "AnswerFormat",
|
||||
"schema": AnswerFormat.model_json_schema(),
|
||||
},
|
||||
},
|
||||
stream=False,
|
||||
)
|
||||
print(response.choices[0].message.content)
|
||||
answer = AnswerFormat.model_validate_json(response.choices[0].message.content)
|
||||
expected = tc["expected"]
|
||||
assert answer.first_name == expected["first_name"]
|
||||
assert answer.last_name == expected["last_name"]
|
||||
assert answer.year_of_birth == expected["year_of_birth"]
|
||||
|
|
|
@ -1,545 +0,0 @@
|
|||
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
||||
# All rights reserved.
|
||||
#
|
||||
# This source code is licensed under the terms described in the LICENSE file in
|
||||
# the root directory of this source tree.
|
||||
|
||||
|
||||
from time import sleep
|
||||
|
||||
import pytest
|
||||
from pydantic import BaseModel
|
||||
|
||||
from llama_stack.models.llama.sku_list import resolve_model
|
||||
|
||||
from ..test_cases.test_case import TestCase
|
||||
|
||||
PROVIDER_LOGPROBS_TOP_K = {"remote::together", "remote::fireworks", "remote::vllm"}
|
||||
|
||||
|
||||
def skip_if_model_doesnt_support_completion(client_with_models, model_id):
|
||||
models = {m.identifier: m for m in client_with_models.models.list()}
|
||||
models.update({m.provider_resource_id: m for m in client_with_models.models.list()})
|
||||
provider_id = models[model_id].provider_id
|
||||
providers = {p.provider_id: p for p in client_with_models.providers.list()}
|
||||
provider = providers[provider_id]
|
||||
if (
|
||||
provider.provider_type
|
||||
in (
|
||||
"remote::openai",
|
||||
"remote::anthropic",
|
||||
"remote::gemini",
|
||||
"remote::vertexai",
|
||||
"remote::groq",
|
||||
"remote::sambanova",
|
||||
"remote::azure",
|
||||
)
|
||||
or "openai-compat" in provider.provider_type
|
||||
):
|
||||
pytest.skip(f"Model {model_id} hosted by {provider.provider_type} doesn't support completion")
|
||||
|
||||
|
||||
def skip_if_model_doesnt_support_json_schema_structured_output(client_with_models, model_id):
|
||||
models = {m.identifier: m for m in client_with_models.models.list()}
|
||||
models.update({m.provider_resource_id: m for m in client_with_models.models.list()})
|
||||
provider_id = models[model_id].provider_id
|
||||
providers = {p.provider_id: p for p in client_with_models.providers.list()}
|
||||
provider = providers[provider_id]
|
||||
if provider.provider_type in ("remote::sambanova", "remote::azure", "remote::watsonx"):
|
||||
pytest.skip(
|
||||
f"Model {model_id} hosted by {provider.provider_type} doesn't support json_schema structured output"
|
||||
)
|
||||
|
||||
|
||||
def get_llama_model(client_with_models, model_id):
|
||||
models = {}
|
||||
for m in client_with_models.models.list():
|
||||
models[m.identifier] = m
|
||||
models[m.provider_resource_id] = m
|
||||
|
||||
assert model_id in models, f"Model {model_id} not found"
|
||||
|
||||
model = models[model_id]
|
||||
ids = (model.identifier, model.provider_resource_id)
|
||||
for mid in ids:
|
||||
if resolve_model(mid):
|
||||
return mid
|
||||
|
||||
return model.metadata.get("llama_model", None)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"test_case",
|
||||
[
|
||||
"inference:completion:sanity",
|
||||
],
|
||||
)
|
||||
def test_text_completion_non_streaming(client_with_models, text_model_id, test_case):
|
||||
skip_if_model_doesnt_support_completion(client_with_models, text_model_id)
|
||||
tc = TestCase(test_case)
|
||||
|
||||
response = client_with_models.inference.completion(
|
||||
content=tc["content"],
|
||||
stream=False,
|
||||
model_id=text_model_id,
|
||||
sampling_params={
|
||||
"max_tokens": 50,
|
||||
},
|
||||
)
|
||||
assert len(response.content) > 10
|
||||
# assert "blue" in response.content.lower().strip()
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"test_case",
|
||||
[
|
||||
"inference:completion:sanity",
|
||||
],
|
||||
)
|
||||
def test_text_completion_streaming(client_with_models, text_model_id, test_case):
|
||||
skip_if_model_doesnt_support_completion(client_with_models, text_model_id)
|
||||
tc = TestCase(test_case)
|
||||
|
||||
response = client_with_models.inference.completion(
|
||||
content=tc["content"],
|
||||
stream=True,
|
||||
model_id=text_model_id,
|
||||
sampling_params={
|
||||
"max_tokens": 50,
|
||||
},
|
||||
)
|
||||
streamed_content = [chunk.delta for chunk in response]
|
||||
content_str = "".join(streamed_content).lower().strip()
|
||||
# assert "blue" in content_str
|
||||
assert len(content_str) > 10
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"test_case",
|
||||
[
|
||||
"inference:completion:stop_sequence",
|
||||
],
|
||||
)
|
||||
def test_text_completion_stop_sequence(client_with_models, text_model_id, inference_provider_type, test_case):
|
||||
skip_if_model_doesnt_support_completion(client_with_models, text_model_id)
|
||||
# This is only supported/tested for remote vLLM: https://github.com/meta-llama/llama-stack/issues/1771
|
||||
if inference_provider_type != "remote::vllm":
|
||||
pytest.xfail(f"{inference_provider_type} doesn't support 'stop' parameter yet")
|
||||
tc = TestCase(test_case)
|
||||
|
||||
response = client_with_models.inference.completion(
|
||||
content=tc["content"],
|
||||
stream=True,
|
||||
model_id=text_model_id,
|
||||
sampling_params={
|
||||
"max_tokens": 50,
|
||||
"stop": ["1963"],
|
||||
},
|
||||
)
|
||||
streamed_content = [chunk.delta for chunk in response]
|
||||
content_str = "".join(streamed_content).lower().strip()
|
||||
assert "1963" not in content_str
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"test_case",
|
||||
[
|
||||
"inference:completion:log_probs",
|
||||
],
|
||||
)
|
||||
def test_text_completion_log_probs_non_streaming(client_with_models, text_model_id, inference_provider_type, test_case):
|
||||
skip_if_model_doesnt_support_completion(client_with_models, text_model_id)
|
||||
if inference_provider_type not in PROVIDER_LOGPROBS_TOP_K:
|
||||
pytest.xfail(f"{inference_provider_type} doesn't support log probs yet")
|
||||
|
||||
tc = TestCase(test_case)
|
||||
|
||||
response = client_with_models.inference.completion(
|
||||
content=tc["content"],
|
||||
stream=False,
|
||||
model_id=text_model_id,
|
||||
sampling_params={
|
||||
"max_tokens": 5,
|
||||
},
|
||||
logprobs={
|
||||
"top_k": 1,
|
||||
},
|
||||
)
|
||||
assert response.logprobs, "Logprobs should not be empty"
|
||||
assert 1 <= len(response.logprobs) <= 5 # each token has 1 logprob and here max_tokens=5
|
||||
assert all(len(logprob.logprobs_by_token) == 1 for logprob in response.logprobs)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"test_case",
|
||||
[
|
||||
"inference:completion:log_probs",
|
||||
],
|
||||
)
|
||||
def test_text_completion_log_probs_streaming(client_with_models, text_model_id, inference_provider_type, test_case):
|
||||
skip_if_model_doesnt_support_completion(client_with_models, text_model_id)
|
||||
if inference_provider_type not in PROVIDER_LOGPROBS_TOP_K:
|
||||
pytest.xfail(f"{inference_provider_type} doesn't support log probs yet")
|
||||
|
||||
tc = TestCase(test_case)
|
||||
|
||||
response = client_with_models.inference.completion(
|
||||
content=tc["content"],
|
||||
stream=True,
|
||||
model_id=text_model_id,
|
||||
sampling_params={
|
||||
"max_tokens": 5,
|
||||
},
|
||||
logprobs={
|
||||
"top_k": 1,
|
||||
},
|
||||
)
|
||||
streamed_content = list(response)
|
||||
for chunk in streamed_content:
|
||||
if chunk.delta: # if there's a token, we expect logprobs
|
||||
assert chunk.logprobs, "Logprobs should not be empty"
|
||||
assert all(len(logprob.logprobs_by_token) == 1 for logprob in chunk.logprobs)
|
||||
else: # no token, no logprobs
|
||||
assert not chunk.logprobs, "Logprobs should be empty"
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"test_case",
|
||||
[
|
||||
"inference:completion:structured_output",
|
||||
],
|
||||
)
|
||||
def test_text_completion_structured_output(client_with_models, text_model_id, test_case):
|
||||
skip_if_model_doesnt_support_completion(client_with_models, text_model_id)
|
||||
skip_if_model_doesnt_support_json_schema_structured_output(client_with_models, text_model_id)
|
||||
|
||||
class AnswerFormat(BaseModel):
|
||||
name: str
|
||||
year_born: str
|
||||
year_retired: str
|
||||
|
||||
tc = TestCase(test_case)
|
||||
|
||||
user_input = tc["user_input"]
|
||||
response = client_with_models.inference.completion(
|
||||
model_id=text_model_id,
|
||||
content=user_input,
|
||||
stream=False,
|
||||
sampling_params={
|
||||
"max_tokens": 50,
|
||||
},
|
||||
response_format={
|
||||
"type": "json_schema",
|
||||
"json_schema": AnswerFormat.model_json_schema(),
|
||||
},
|
||||
)
|
||||
answer = AnswerFormat.model_validate_json(response.content)
|
||||
expected = tc["expected"]
|
||||
assert answer.name == expected["name"]
|
||||
assert answer.year_born == expected["year_born"]
|
||||
assert answer.year_retired == expected["year_retired"]
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"test_case",
|
||||
[
|
||||
"inference:chat_completion:non_streaming_01",
|
||||
"inference:chat_completion:non_streaming_02",
|
||||
],
|
||||
)
|
||||
def test_text_chat_completion_non_streaming(client_with_models, text_model_id, test_case):
|
||||
tc = TestCase(test_case)
|
||||
question = tc["question"]
|
||||
expected = tc["expected"]
|
||||
|
||||
response = client_with_models.inference.chat_completion(
|
||||
model_id=text_model_id,
|
||||
messages=[
|
||||
{
|
||||
"role": "user",
|
||||
"content": question,
|
||||
}
|
||||
],
|
||||
stream=False,
|
||||
)
|
||||
message_content = response.completion_message.content.lower().strip()
|
||||
assert len(message_content) > 0
|
||||
assert expected.lower() in message_content
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"test_case",
|
||||
[
|
||||
"inference:chat_completion:streaming_01",
|
||||
"inference:chat_completion:streaming_02",
|
||||
],
|
||||
)
|
||||
def test_text_chat_completion_streaming(client_with_models, text_model_id, test_case):
|
||||
tc = TestCase(test_case)
|
||||
question = tc["question"]
|
||||
expected = tc["expected"]
|
||||
|
||||
response = client_with_models.inference.chat_completion(
|
||||
model_id=text_model_id,
|
||||
messages=[{"role": "user", "content": question}],
|
||||
stream=True,
|
||||
timeout=120, # Increase timeout to 2 minutes for large conversation history
|
||||
)
|
||||
streamed_content = [str(chunk.event.delta.text.lower().strip()) for chunk in response]
|
||||
assert len(streamed_content) > 0
|
||||
assert expected.lower() in "".join(streamed_content)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"test_case",
|
||||
[
|
||||
"inference:chat_completion:tool_calling",
|
||||
],
|
||||
)
|
||||
def test_text_chat_completion_with_tool_calling_and_non_streaming(client_with_models, text_model_id, test_case):
|
||||
tc = TestCase(test_case)
|
||||
|
||||
response = client_with_models.inference.chat_completion(
|
||||
model_id=text_model_id,
|
||||
messages=tc["messages"],
|
||||
tools=tc["tools"],
|
||||
tool_choice="auto",
|
||||
stream=False,
|
||||
)
|
||||
# some models can return content for the response in addition to the tool call
|
||||
assert response.completion_message.role == "assistant"
|
||||
|
||||
assert len(response.completion_message.tool_calls) == 1
|
||||
assert response.completion_message.tool_calls[0].tool_name == tc["tools"][0]["tool_name"]
|
||||
assert response.completion_message.tool_calls[0].arguments == tc["expected"]
|
||||
|
||||
|
||||
# Will extract streamed text and separate it from tool invocation content
|
||||
# The returned tool invocation content will be a string, so it's easy to compare with the expected value
|
||||
# e.g. "[get_weather, {'location': 'San Francisco, CA'}]"
|
||||
def extract_tool_invocation_content(response):
|
||||
tool_invocation_content: str = ""
|
||||
for chunk in response:
|
||||
delta = chunk.event.delta
|
||||
if delta.type == "tool_call" and delta.parse_status == "succeeded":
|
||||
call = delta.tool_call
|
||||
tool_invocation_content += f"[{call.tool_name}, {call.arguments}]"
|
||||
return tool_invocation_content
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"test_case",
|
||||
[
|
||||
"inference:chat_completion:tool_calling",
|
||||
],
|
||||
)
|
||||
def test_text_chat_completion_with_tool_calling_and_streaming(client_with_models, text_model_id, test_case):
|
||||
tc = TestCase(test_case)
|
||||
|
||||
response = client_with_models.inference.chat_completion(
|
||||
model_id=text_model_id,
|
||||
messages=tc["messages"],
|
||||
tools=tc["tools"],
|
||||
tool_choice="auto",
|
||||
stream=True,
|
||||
)
|
||||
tool_invocation_content = extract_tool_invocation_content(response)
|
||||
expected_tool_name = tc["tools"][0]["tool_name"]
|
||||
expected_argument = tc["expected"]
|
||||
assert tool_invocation_content == f"[{expected_tool_name}, {expected_argument}]"
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"test_case",
|
||||
[
|
||||
"inference:chat_completion:tool_calling",
|
||||
],
|
||||
)
|
||||
def test_text_chat_completion_with_tool_choice_required(client_with_models, text_model_id, test_case):
|
||||
tc = TestCase(test_case)
|
||||
|
||||
response = client_with_models.inference.chat_completion(
|
||||
model_id=text_model_id,
|
||||
messages=tc["messages"],
|
||||
tools=tc["tools"],
|
||||
tool_config={
|
||||
"tool_choice": "required",
|
||||
},
|
||||
stream=True,
|
||||
)
|
||||
tool_invocation_content = extract_tool_invocation_content(response)
|
||||
expected_tool_name = tc["tools"][0]["tool_name"]
|
||||
expected_argument = tc["expected"]
|
||||
assert tool_invocation_content == f"[{expected_tool_name}, {expected_argument}]"
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"test_case",
|
||||
[
|
||||
"inference:chat_completion:tool_calling",
|
||||
],
|
||||
)
|
||||
def test_text_chat_completion_with_tool_choice_none(client_with_models, text_model_id, test_case):
|
||||
tc = TestCase(test_case)
|
||||
|
||||
response = client_with_models.inference.chat_completion(
|
||||
model_id=text_model_id,
|
||||
messages=tc["messages"],
|
||||
tools=tc["tools"],
|
||||
tool_config={"tool_choice": "none"},
|
||||
stream=True,
|
||||
)
|
||||
tool_invocation_content = extract_tool_invocation_content(response)
|
||||
assert tool_invocation_content == ""
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"test_case",
|
||||
[
|
||||
"inference:chat_completion:structured_output",
|
||||
],
|
||||
)
|
||||
def test_text_chat_completion_structured_output(client_with_models, text_model_id, test_case):
|
||||
skip_if_model_doesnt_support_json_schema_structured_output(client_with_models, text_model_id)
|
||||
|
||||
class NBAStats(BaseModel):
|
||||
year_for_draft: int
|
||||
num_seasons_in_nba: int
|
||||
|
||||
class AnswerFormat(BaseModel):
|
||||
first_name: str
|
||||
last_name: str
|
||||
year_of_birth: int
|
||||
nba_stats: NBAStats
|
||||
|
||||
tc = TestCase(test_case)
|
||||
|
||||
response = client_with_models.inference.chat_completion(
|
||||
model_id=text_model_id,
|
||||
messages=tc["messages"],
|
||||
response_format={
|
||||
"type": "json_schema",
|
||||
"json_schema": AnswerFormat.model_json_schema(),
|
||||
},
|
||||
stream=False,
|
||||
)
|
||||
answer = AnswerFormat.model_validate_json(response.completion_message.content)
|
||||
expected = tc["expected"]
|
||||
assert answer.first_name == expected["first_name"]
|
||||
assert answer.last_name == expected["last_name"]
|
||||
assert answer.year_of_birth == expected["year_of_birth"]
|
||||
assert answer.nba_stats.num_seasons_in_nba == expected["num_seasons_in_nba"]
|
||||
assert answer.nba_stats.year_for_draft == expected["year_for_draft"]
|
||||
|
||||
|
||||
@pytest.mark.parametrize("streaming", [True, False])
|
||||
@pytest.mark.parametrize(
|
||||
"test_case",
|
||||
[
|
||||
"inference:chat_completion:tool_calling_tools_absent",
|
||||
],
|
||||
)
|
||||
def test_text_chat_completion_tool_calling_tools_not_in_request(
|
||||
client_with_models, text_model_id, test_case, streaming
|
||||
):
|
||||
tc = TestCase(test_case)
|
||||
|
||||
# TODO: more dynamic lookup on tool_prompt_format for model family
|
||||
tool_prompt_format = "json" if "3.1" in text_model_id else "python_list"
|
||||
request = {
|
||||
"model_id": text_model_id,
|
||||
"messages": tc["messages"],
|
||||
"tools": tc["tools"],
|
||||
"tool_choice": "auto",
|
||||
"tool_prompt_format": tool_prompt_format,
|
||||
"stream": streaming,
|
||||
}
|
||||
|
||||
response = client_with_models.inference.chat_completion(**request)
|
||||
|
||||
if streaming:
|
||||
for chunk in response:
|
||||
delta = chunk.event.delta
|
||||
if delta.type == "tool_call" and delta.parse_status == "succeeded":
|
||||
assert delta.tool_call.tool_name == "get_object_namespace_list"
|
||||
if delta.type == "tool_call" and delta.parse_status == "failed":
|
||||
# expect raw message that failed to parse in tool_call
|
||||
assert isinstance(delta.tool_call, str)
|
||||
assert len(delta.tool_call) > 0
|
||||
else:
|
||||
for tc in response.completion_message.tool_calls:
|
||||
assert tc.tool_name == "get_object_namespace_list"
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"test_case",
|
||||
[
|
||||
# Tests if the model can handle simple messages like "Hi" or
|
||||
# a message unrelated to one of the tool calls
|
||||
"inference:chat_completion:text_then_tool",
|
||||
# Tests if the model can do full tool call with responses correctly
|
||||
"inference:chat_completion:tool_then_answer",
|
||||
# Tests if model can generate multiple params and
|
||||
# read outputs correctly
|
||||
"inference:chat_completion:array_parameter",
|
||||
],
|
||||
)
|
||||
def test_text_chat_completion_with_multi_turn_tool_calling(client_with_models, text_model_id, test_case):
|
||||
"""This test tests the model's tool calling loop in various scenarios"""
|
||||
if "llama-4" not in text_model_id.lower() and "llama4" not in text_model_id.lower():
|
||||
pytest.xfail("Not tested for non-llama4 models yet")
|
||||
|
||||
tc = TestCase(test_case)
|
||||
messages = []
|
||||
|
||||
# keep going until either
|
||||
# 1. we have messages to test in multi-turn
|
||||
# 2. no new messages but the last message is a tool response
|
||||
while len(tc["messages"]) > 0 or (len(messages) > 0 and messages[-1]["role"] == "tool"):
|
||||
# do not take new messages if last message is tool response
|
||||
if len(messages) == 0 or messages[-1]["role"] != "tool":
|
||||
new_messages = tc["messages"].pop(0)
|
||||
messages += new_messages
|
||||
|
||||
# pprint(messages)
|
||||
response = client_with_models.inference.chat_completion(
|
||||
model_id=text_model_id,
|
||||
messages=messages,
|
||||
tools=tc["tools"],
|
||||
stream=False,
|
||||
sampling_params={
|
||||
"strategy": {
|
||||
"type": "top_p",
|
||||
"top_p": 0.9,
|
||||
"temperature": 0.6,
|
||||
}
|
||||
},
|
||||
)
|
||||
op_msg = response.completion_message
|
||||
messages.append(op_msg.model_dump())
|
||||
# print(op_msg)
|
||||
|
||||
assert op_msg.role == "assistant"
|
||||
expected = tc["expected"].pop(0)
|
||||
assert len(op_msg.tool_calls) == expected["num_tool_calls"]
|
||||
|
||||
if expected["num_tool_calls"] > 0:
|
||||
assert op_msg.tool_calls[0].tool_name == expected["tool_name"]
|
||||
assert op_msg.tool_calls[0].arguments == expected["tool_arguments"]
|
||||
|
||||
tool_response = tc["tool_responses"].pop(0)
|
||||
messages.append(
|
||||
# Tool Response Message
|
||||
{
|
||||
"role": "tool",
|
||||
"call_id": op_msg.tool_calls[0].call_id,
|
||||
"content": tool_response["response"],
|
||||
}
|
||||
)
|
||||
else:
|
||||
actual_answer = op_msg.content.lower()
|
||||
# pprint(actual_answer)
|
||||
assert expected["answer"] in actual_answer
|
||||
|
||||
# sleep to avoid rate limit
|
||||
sleep(1)
|
|
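The deleted suite above exercised the deprecated `inference.chat_completion` API. Its OpenAI-compatible replacement (`/v1/openai/v1/chat/completions`) is covered by the tests retained in this PR; a minimal sketch with placeholder values:

```python
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8321/v1/openai/v1", api_key="none")  # placeholders
resp = client.chat.completions.create(
    model="my-model",
    messages=[{"role": "user", "content": "Which planet do humans live on?"}],
    stream=False,
)
print(resp.choices[0].message.content)
```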
@ -25,16 +25,19 @@ def base64_image_data(image_path):
|
|||
return base64.b64encode(image_path.read_bytes()).decode("utf-8")
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def base64_image_url(base64_image_data):
|
||||
return f"data:image/png;base64,{base64_image_data}"
|
||||
|
||||
|
||||
def test_image_chat_completion_non_streaming(client_with_models, vision_model_id):
|
||||
message = {
|
||||
"role": "user",
|
||||
"content": [
|
||||
{
|
||||
"type": "image",
|
||||
"image": {
|
||||
"url": {
|
||||
"uri": "https://raw.githubusercontent.com/meta-llama/llama-stack/main/tests/integration/inference/dog.png"
|
||||
},
|
||||
"type": "image_url",
|
||||
"image_url": {
|
||||
"url": "https://raw.githubusercontent.com/meta-llama/llama-stack/main/tests/integration/inference/dog.png"
|
||||
},
|
||||
},
|
||||
{
|
||||
|
@ -43,12 +46,12 @@ def test_image_chat_completion_non_streaming(client_with_models, vision_model_id
|
|||
},
|
||||
],
|
||||
}
|
||||
response = client_with_models.inference.chat_completion(
|
||||
model_id=vision_model_id,
|
||||
response = client_with_models.chat.completions.create(
|
||||
model=vision_model_id,
|
||||
messages=[message],
|
||||
stream=False,
|
||||
)
|
||||
message_content = response.completion_message.content.lower().strip()
|
||||
message_content = response.choices[0].message.content.lower().strip()
|
||||
assert len(message_content) > 0
|
||||
assert any(expected in message_content for expected in {"dog", "puppy", "pup"})
|
||||
|
||||
|
@ -68,8 +71,13 @@ def multi_image_data():
|
|||
return encoded_files
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def multi_image_url(multi_image_data):
|
||||
return [f"data:image/jpeg;base64,{data}" for data in multi_image_data]
|
||||
|
||||
|
||||
@pytest.mark.parametrize("stream", [True, False])
|
||||
def test_image_chat_completion_multiple_images(client_with_models, vision_model_id, multi_image_data, stream):
|
||||
def test_image_chat_completion_multiple_images(client_with_models, vision_model_id, multi_image_url, stream):
|
||||
supported_models = ["llama-4", "gpt-4o", "llama4"]
|
||||
if not any(model in vision_model_id.lower() for model in supported_models):
|
||||
pytest.skip(
|
||||
|
@ -81,15 +89,15 @@ def test_image_chat_completion_multiple_images(client_with_models, vision_model_
|
|||
"role": "user",
|
||||
"content": [
|
||||
{
|
||||
"type": "image",
|
||||
"image": {
|
||||
"data": multi_image_data[0],
|
||||
"type": "image_url",
|
||||
"image_url": {
|
||||
"url": multi_image_url[0],
|
||||
},
|
||||
},
|
||||
{
|
||||
"type": "image",
|
||||
"image": {
|
||||
"data": multi_image_data[1],
|
||||
"type": "image_url",
|
||||
"image_url": {
|
||||
"url": multi_image_url[1],
|
||||
},
|
||||
},
|
||||
{
|
||||
|
@ -99,17 +107,17 @@ def test_image_chat_completion_multiple_images(client_with_models, vision_model_
|
|||
],
|
||||
},
|
||||
]
|
||||
response = client_with_models.inference.chat_completion(
|
||||
model_id=vision_model_id,
|
||||
response = client_with_models.chat.completions.create(
|
||||
model=vision_model_id,
|
||||
messages=messages,
|
||||
stream=stream,
|
||||
)
|
||||
if stream:
|
||||
message_content = ""
|
||||
for chunk in response:
|
||||
message_content += chunk.event.delta.text
|
||||
message_content += chunk.choices[0].delta.content
|
||||
else:
|
||||
message_content = response.completion_message.content
|
||||
message_content = response.choices[0].message.content
|
||||
assert len(message_content) > 0
|
||||
assert any(expected in message_content.lower().strip() for expected in {"bedroom"}), message_content
|
||||
|
||||
|
@ -125,17 +133,17 @@ def test_image_chat_completion_multiple_images(client_with_models, vision_model_
|
|||
"role": "user",
|
||||
"content": [
|
||||
{
|
||||
"type": "image",
|
||||
"image": {
|
||||
"data": multi_image_data[2],
|
||||
"type": "image_url",
|
||||
"image_url": {
|
||||
"url": multi_image_data[2],
|
||||
},
|
||||
},
|
||||
{"type": "text", "text": "How about this one?"},
|
||||
],
|
||||
},
|
||||
)
|
||||
response = client_with_models.inference.chat_completion(
|
||||
model_id=vision_model_id,
|
||||
response = client_with_models.chat.completions.create(
|
||||
model=vision_model_id,
|
||||
messages=messages,
|
||||
stream=stream,
|
||||
)
|
||||
|
@ -144,7 +152,7 @@ def test_image_chat_completion_multiple_images(client_with_models, vision_model_
|
|||
for chunk in response:
|
||||
message_content += chunk.event.delta.text
|
||||
else:
|
||||
message_content = response.completion_message.content
|
||||
message_content = response.choices[0].message.content
|
||||
assert len(message_content) > 0
|
||||
assert any(expected in message_content.lower().strip() for expected in {"sword", "shield"}), message_content
|
||||
|
||||
|
@ -154,11 +162,9 @@ def test_image_chat_completion_streaming(client_with_models, vision_model_id):
|
|||
"role": "user",
|
||||
"content": [
|
||||
{
|
||||
"type": "image",
|
||||
"image": {
|
||||
"url": {
|
||||
"uri": "https://raw.githubusercontent.com/meta-llama/llama-stack/main/tests/integration/inference/dog.png"
|
||||
},
|
||||
"type": "image_url",
|
||||
"image_url": {
|
||||
"url": "https://raw.githubusercontent.com/meta-llama/llama-stack/main/tests/integration/inference/dog.png"
|
||||
},
|
||||
},
|
||||
{
|
||||
|
@ -167,23 +173,23 @@ def test_image_chat_completion_streaming(client_with_models, vision_model_id):
|
|||
},
|
||||
],
|
||||
}
|
||||
response = client_with_models.inference.chat_completion(
|
||||
model_id=vision_model_id,
|
||||
response = client_with_models.chat.completions.create(
|
||||
model=vision_model_id,
|
||||
messages=[message],
|
||||
stream=True,
|
||||
)
|
||||
streamed_content = ""
|
||||
for chunk in response:
|
||||
streamed_content += chunk.event.delta.text.lower()
|
||||
streamed_content += chunk.choices[0].delta.content.lower()
|
||||
assert len(streamed_content) > 0
|
||||
assert any(expected in streamed_content for expected in {"dog", "puppy", "pup"})
|
||||
|
||||
|
||||
def test_image_chat_completion_base64(client_with_models, vision_model_id, base64_image_data):
|
||||
def test_image_chat_completion_base64(client_with_models, vision_model_id, base64_image_url):
|
||||
image_spec = {
|
||||
"type": "image",
|
||||
"image": {
|
||||
"data": base64_image_data,
|
||||
"type": "image_url",
|
||||
"image_url": {
|
||||
"url": base64_image_url,
|
||||
},
|
||||
}
|
||||
|
||||
|
@ -197,10 +203,10 @@ def test_image_chat_completion_base64(client_with_models, vision_model_id, base6
|
|||
},
|
||||
],
|
||||
}
|
||||
response = client_with_models.inference.chat_completion(
|
||||
model_id=vision_model_id,
|
||||
response = client_with_models.chat.completions.create(
|
||||
model=vision_model_id,
|
||||
messages=[message],
|
||||
stream=False,
|
||||
)
|
||||
message_content = response.completion_message.content.lower().strip()
|
||||
message_content = response.choices[0].message.content.lower().strip()
|
||||
assert len(message_content) > 0
|
||||
|
|
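Taken together, the edits above are a mechanical migration from the removed `client.inference.chat_completion(model_id=...)` call to the OpenAI-compatible `client.chat.completions.create(model=...)`: image parts are expressed as `image_url` content, non-streaming results are read from `response.choices[0].message.content`, and streaming deltas from `chunk.choices[0].delta.content`. The following minimal sketch shows the new pattern in isolation; the base URL, model id, and prompt text are placeholders, not values taken from the test suite.

from llama_stack_client import LlamaStackClient

client = LlamaStackClient(base_url="http://localhost:8321")  # placeholder endpoint

message = {
    "role": "user",
    "content": [
        {
            "type": "image_url",
            "image_url": {
                "url": "https://raw.githubusercontent.com/meta-llama/llama-stack/main/tests/integration/inference/dog.png"
            },
        },
        {"type": "text", "text": "What animal is in this picture?"},  # illustrative prompt
    ],
}

# Non-streaming: the reply is on choices[0].message, not completion_message.
response = client.chat.completions.create(model="<vision-model-id>", messages=[message], stream=False)
print(response.choices[0].message.content)

# Streaming: text arrives incrementally on choices[0].delta.content.
for chunk in client.chat.completions.create(model="<vision-model-id>", messages=[message], stream=True):
    print(chunk.choices[0].delta.content or "", end="")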
89  tests/integration/recordings/responses/239f4768f5aa.json  Normal file
@@ -0,0 +1,89 @@
{
  "request": {
    "method": "POST",
    "url": "http://0.0.0.0:11434/v1/v1/chat/completions",
    "headers": {},
    "body": {
      "model": "llama3.2:3b-instruct-fp16",
      "messages": [
        {"role": "system", "content": "You are a helpful assistant. Michael Jordan was born in 1963. He played basketball for the Chicago Bulls for 15 seasons."},
        {"role": "user", "content": "Please give me information about Michael Jordan."}
      ],
      "response_format": {
        "type": "json_schema",
        "json_schema": {
          "name": "AnswerFormat",
          "schema": {
            "properties": {
              "first_name": {"title": "First Name", "type": "string"},
              "last_name": {"title": "Last Name", "type": "string"},
              "year_of_birth": {"title": "Year Of Birth", "type": "integer"}
            },
            "required": ["first_name", "last_name", "year_of_birth"],
            "title": "AnswerFormat",
            "type": "object"
          }
        }
      },
      "stream": false
    },
    "endpoint": "/v1/chat/completions",
    "model": "llama3.2:3b-instruct-fp16"
  },
  "response": {
    "body": {
      "__type__": "openai.types.chat.chat_completion.ChatCompletion",
      "__data__": {
        "id": "chatcmpl-433",
        "choices": [
          {
            "finish_reason": "stop",
            "index": 0,
            "logprobs": null,
            "message": {
              "content": "{\"first_name\": \"Michael\", \"last_name\": \"Jordan\", \"year_of_birth\": 1963}\n\n \t\t\t\t\t\t\t\t\t\t\t \t\t ",
              "refusal": null, "role": "assistant", "annotations": null,
              "audio": null, "function_call": null, "tool_calls": null
            }
          }
        ],
        "created": 1758979490,
        "model": "llama3.2:3b-instruct-fp16",
        "object": "chat.completion",
        "service_tier": null,
        "system_fingerprint": "fp_ollama",
        "usage": {"completion_tokens": 31, "prompt_tokens": 60, "total_tokens": 91, "completion_tokens_details": null, "prompt_tokens_details": null}
      }
    },
    "is_streaming": false
  }
}
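Each of the new recording fixtures in this directory pairs the captured request (method, URL, headers, body, endpoint, model) with the provider's serialized response, where `__type__` names the OpenAI client class and `__data__` carries its fields. A rough sketch of how such a fixture could be rehydrated for offline inspection; the file path is only an example, and `ChatCompletion.model_validate` assumes the pydantic v2 models shipped with the `openai` Python package.

import json

from openai.types.chat import ChatCompletion  # pydantic model from the openai SDK

# Example path; any non-streaming chat recording here has the same shape.
with open("tests/integration/recordings/responses/239f4768f5aa.json") as f:
    recording = json.load(f)

request = recording["request"]          # what was sent to the provider
body = recording["response"]["body"]    # {"__type__": ..., "__data__": {...}}

assert body["__type__"] == "openai.types.chat.chat_completion.ChatCompletion"
completion = ChatCompletion.model_validate(body["__data__"])

print(request["endpoint"], request["model"])
print(completion.choices[0].message.content)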
44  tests/integration/recordings/responses/2a5a4e821bc8.json  Normal file
@@ -0,0 +1,44 @@
{
  "request": {
    "method": "POST",
    "url": "http://0.0.0.0:11434/v1/v1/completions",
    "headers": {},
    "body": {
      "model": "llama3.2:3b-instruct-fp16",
      "prompt": "Hello, world!",
      "logprobs": false,
      "stream": false,
      "extra_body": {}
    },
    "endpoint": "/v1/completions",
    "model": "llama3.2:3b-instruct-fp16"
  },
  "response": {
    "body": {
      "__type__": "openai.types.completion.Completion",
      "__data__": {
        "id": "cmpl-74",
        "choices": [
          {"finish_reason": "stop", "index": 0, "logprobs": null, "text": "Hello! How can I assist you today?"}
        ],
        "created": 1758975636,
        "model": "llama3.2:3b-instruct-fp16",
        "object": "text_completion",
        "system_fingerprint": "fp_ollama",
        "usage": {"completion_tokens": 10, "prompt_tokens": 29, "total_tokens": 39, "completion_tokens_details": null, "prompt_tokens_details": null}
      }
    },
    "is_streaming": false
  }
}
2141  tests/integration/recordings/responses/2fef6eda9cd7.json  Normal file
File diff suppressed because one or more lines are too long
92  tests/integration/recordings/responses/38ea441b5f83.json  Normal file
@@ -0,0 +1,92 @@
{
  "request": {
    "method": "POST",
    "url": "http://0.0.0.0:11434/v1/v1/chat/completions",
    "headers": {},
    "body": {
      "model": "llama3.2:3b-instruct-fp16",
      "messages": [
        {"role": "system", "content": "Pretend you are a weather assistant."},
        {"role": "user", "content": "What's the weather like in San Francisco, CA?"}
      ],
      "stream": false,
      "tool_choice": "auto",
      "tools": [
        {
          "type": "function",
          "function": {
            "name": "get_weather",
            "description": "Get the current weather",
            "parameters": {
              "type": "object",
              "properties": {
                "location": {"type": "string", "description": "The city and state (both required), e.g. San Francisco, CA."}
              },
              "required": ["location"]
            }
          }
        }
      ]
    },
    "endpoint": "/v1/chat/completions",
    "model": "llama3.2:3b-instruct-fp16"
  },
  "response": {
    "body": {
      "__type__": "openai.types.chat.chat_completion.ChatCompletion",
      "__data__": {
        "id": "chatcmpl-761",
        "choices": [
          {
            "finish_reason": "tool_calls",
            "index": 0,
            "logprobs": null,
            "message": {
              "content": "",
              "refusal": null, "role": "assistant", "annotations": null,
              "audio": null, "function_call": null,
              "tool_calls": [
                {
                  "id": "call_cj8ownwc",
                  "function": {"arguments": "{\"location\":\"San Francisco, CA\"}", "name": "get_weather"},
                  "type": "function",
                  "index": 0
                }
              ]
            }
          }
        ],
        "created": 1758975113,
        "model": "llama3.2:3b-instruct-fp16",
        "object": "chat.completion",
        "service_tier": null,
        "system_fingerprint": "fp_ollama",
        "usage": {"completion_tokens": 18, "prompt_tokens": 185, "total_tokens": 203, "completion_tokens_details": null, "prompt_tokens_details": null}
      }
    },
    "is_streaming": false
  }
}
44  tests/integration/recordings/responses/5b2088233334.json  Normal file
@@ -0,0 +1,44 @@
{
  "request": {
    "method": "POST",
    "url": "http://0.0.0.0:11434/v1/v1/completions",
    "headers": {},
    "body": {
      "model": "llama3.2:3b-instruct-fp16",
      "prompt": "Hello, world!",
      "logprobs": true,
      "stream": false,
      "extra_body": {}
    },
    "endpoint": "/v1/completions",
    "model": "llama3.2:3b-instruct-fp16"
  },
  "response": {
    "body": {
      "__type__": "openai.types.completion.Completion",
      "__data__": {
        "id": "cmpl-809",
        "choices": [
          {"finish_reason": "stop", "index": 0, "logprobs": null, "text": "Hello! It's nice to meet you. Is there anything I can help you with or would you like to chat?"}
        ],
        "created": 1758975633,
        "model": "llama3.2:3b-instruct-fp16",
        "object": "text_completion",
        "system_fingerprint": "fp_ollama",
        "usage": {"completion_tokens": 25, "prompt_tokens": 29, "total_tokens": 54, "completion_tokens_details": null, "prompt_tokens_details": null}
      }
    },
    "is_streaming": false
  }
}
60  tests/integration/recordings/responses/65c12de0a1db.json  Normal file
@@ -0,0 +1,60 @@
{
  "request": {
    "method": "POST",
    "url": "http://0.0.0.0:11434/v1/v1/chat/completions",
    "headers": {},
    "body": {
      "model": "llama3.2:3b-instruct-fp16",
      "messages": [
        {"role": "system", "content": "Pretend you are a weather assistant."},
        {"role": "user", "content": "What's the weather like in San Francisco, CA?"}
      ],
      "stream": false
    },
    "endpoint": "/v1/chat/completions",
    "model": "llama3.2:3b-instruct-fp16"
  },
  "response": {
    "body": {
      "__type__": "openai.types.chat.chat_completion.ChatCompletion",
      "__data__": {
        "id": "chatcmpl-123",
        "choices": [
          {
            "finish_reason": "stop",
            "index": 0,
            "logprobs": null,
            "message": {
              "content": "Hello! As of my knowledge cutoff on December 15th, I have the latest information for you. However, please note that my data may not be entirely up-to-date.\n\nCurrently, and based on historical climate patterns, it appears to be a partly cloudy day with mild temperatures in San Francisco, CA. Expect a temperature range of around 48\u00b0F (9\u00b0C) to 54\u00b0F (12\u00b0C). It's likely to be a breezy day, with winds blowing at about 13 mph (21 km/h).\n\nHowever, if I were to look into more recent weather patterns or forecasts, I would recommend checking the latest conditions directly from reliable sources such as the National Weather Service or local news outlets for more accurate and up-to-date information.\n\nPlease let me know how I can further assist you.",
              "refusal": null, "role": "assistant", "annotations": null,
              "audio": null, "function_call": null, "tool_calls": null
            }
          }
        ],
        "created": 1758978071,
        "model": "llama3.2:3b-instruct-fp16",
        "object": "chat.completion",
        "service_tier": null,
        "system_fingerprint": "fp_ollama",
        "usage": {"completion_tokens": 163, "prompt_tokens": 45, "total_tokens": 208, "completion_tokens_details": null, "prompt_tokens_details": null}
      }
    },
    "is_streaming": false
  }
}
55  tests/integration/recordings/responses/a369881bb3a2.json  Normal file
@@ -0,0 +1,55 @@
{
  "request": {
    "method": "POST",
    "url": "http://0.0.0.0:11434/v1/v1/chat/completions",
    "headers": {},
    "body": {
      "model": "llama3.2:3b-instruct-fp16",
      "messages": [
        {"role": "user", "content": "Test trace 0"}
      ]
    },
    "endpoint": "/v1/chat/completions",
    "model": "llama3.2:3b-instruct-fp16"
  },
  "response": {
    "body": {
      "__type__": "openai.types.chat.chat_completion.ChatCompletion",
      "__data__": {
        "id": "chatcmpl-272",
        "choices": [
          {
            "finish_reason": "stop",
            "index": 0,
            "logprobs": null,
            "message": {
              "content": "I'm happy to help you with a test. Since we are in the middle of a text-based conversation, I'll do my best to simulate a simple test tracing process.\n\n**Trace Test Results**\n\nTo perform this test, please follow these steps:\n\n1. Type \"test\" on command mode.\n2. Press Enter.\n\nNow, let's start tracing...\n\nTest Tracing Results:\nTest Case: General Functions\nTest Case Result: PASS\n\nSystem Response:\n\n```\n# System Boot Time: 2023-10-13T14:30:00\n# CPU Temperature: 35\u00b0C\n# Disk Space Available: 80%\n```\n\nNext Steps?\n\nType 'done' to exit the test, or 'run' for more tests.",
              "refusal": null, "role": "assistant", "annotations": null,
              "audio": null, "function_call": null, "tool_calls": null
            }
          }
        ],
        "created": 1758978134,
        "model": "llama3.2:3b-instruct-fp16",
        "object": "chat.completion",
        "service_tier": null,
        "system_fingerprint": "fp_ollama",
        "usage": {"completion_tokens": 152, "prompt_tokens": 29, "total_tokens": 181, "completion_tokens_details": null, "prompt_tokens_details": null}
      }
    },
    "is_streaming": false
  }
}
44  tests/integration/recordings/responses/a46b77ffd494.json  Normal file
@@ -0,0 +1,44 @@
{
  "request": {
    "method": "POST",
    "url": "http://0.0.0.0:11434/v1/v1/completions",
    "headers": {},
    "body": {
      "model": "llama3.2:3b-instruct-fp16",
      "prompt": "Return the exact same sentence and don't add additional words): Michael Jordan was born in the year of 1963",
      "stop": "1963",
      "stream": false,
      "extra_body": {}
    },
    "endpoint": "/v1/completions",
    "model": "llama3.2:3b-instruct-fp16"
  },
  "response": {
    "body": {
      "__type__": "openai.types.completion.Completion",
      "__data__": {
        "id": "cmpl-183",
        "choices": [
          {"finish_reason": "stop", "index": 0, "logprobs": null, "text": "Michael Jordan was born in the year of "}
        ],
        "created": 1758978053,
        "model": "llama3.2:3b-instruct-fp16",
        "object": "text_completion",
        "system_fingerprint": "fp_ollama",
        "usage": {"completion_tokens": 11, "prompt_tokens": 48, "total_tokens": 59, "completion_tokens_details": null, "prompt_tokens_details": null}
      }
    },
    "is_streaming": false
  }
}
112  tests/integration/recordings/responses/c3dbccc5de74.json  Normal file
@@ -0,0 +1,112 @@
{
  "request": {
    "method": "POST",
    "url": "http://0.0.0.0:11434/v1/v1/chat/completions",
    "headers": {},
    "body": {
      "model": "llama3.2:3b-instruct-fp16",
      "messages": [
        {"role": "system", "content": "Pretend you are a weather assistant."},
        {"role": "user", "content": "What's the weather like in San Francisco, CA?"}
      ],
      "stream": true,
      "tool_choice": "auto",
      "tools": [
        {
          "type": "function",
          "function": {
            "name": "get_weather",
            "description": "Get the current weather",
            "parameters": {
              "type": "object",
              "properties": {
                "location": {"type": "string", "description": "The city and state (both required), e.g. San Francisco, CA."}
              },
              "required": ["location"]
            }
          }
        }
      ]
    },
    "endpoint": "/v1/chat/completions",
    "model": "llama3.2:3b-instruct-fp16"
  },
  "response": {
    "body": [
      {
        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
        "__data__": {
          "id": "chatcmpl-634",
          "choices": [
            {
              "delta": {
                "content": "",
                "function_call": null,
                "refusal": null,
                "role": "assistant",
                "tool_calls": [
                  {
                    "index": 0,
                    "id": "call_wubm4yax",
                    "function": {"arguments": "{\"location\":\"San Francisco, CA\"}", "name": "get_weather"},
                    "type": "function"
                  }
                ]
              },
              "finish_reason": null,
              "index": 0,
              "logprobs": null
            }
          ],
          "created": 1758975115,
          "model": "llama3.2:3b-instruct-fp16",
          "object": "chat.completion.chunk",
          "service_tier": null,
          "system_fingerprint": "fp_ollama",
          "usage": null
        }
      },
      {
        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
        "__data__": {
          "id": "chatcmpl-634",
          "choices": [
            {
              "delta": {"content": "", "function_call": null, "refusal": null, "role": "assistant", "tool_calls": null},
              "finish_reason": "tool_calls",
              "index": 0,
              "logprobs": null
            }
          ],
          "created": 1758975115,
          "model": "llama3.2:3b-instruct-fp16",
          "object": "chat.completion.chunk",
          "service_tier": null,
          "system_fingerprint": "fp_ollama",
          "usage": null
        }
      }
    ],
    "is_streaming": true
  }
}
47  tests/integration/recordings/responses/c8e196049fe4.json  Normal file
@@ -0,0 +1,47 @@
{
  "request": {
    "method": "POST",
    "url": "http://0.0.0.0:11434/v1/v1/completions",
    "headers": {},
    "body": {
      "model": "llama3.2:3b-instruct-fp16",
      "prompt": "Return the exact same sentence and don't add additional words): Michael Jordan was born in the year of 1963",
      "stop": ["blathering", "1963"],
      "stream": false,
      "extra_body": {}
    },
    "endpoint": "/v1/completions",
    "model": "llama3.2:3b-instruct-fp16"
  },
  "response": {
    "body": {
      "__type__": "openai.types.completion.Completion",
      "__data__": {
        "id": "cmpl-381",
        "choices": [
          {"finish_reason": "stop", "index": 0, "logprobs": null, "text": "Michael Jordan was born in the year of "}
        ],
        "created": 1758978056,
        "model": "llama3.2:3b-instruct-fp16",
        "object": "text_completion",
        "system_fingerprint": "fp_ollama",
        "usage": {"completion_tokens": 11, "prompt_tokens": 48, "total_tokens": 59, "completion_tokens_details": null, "prompt_tokens_details": null}
      }
    },
    "is_streaming": false
  }
}
55  tests/integration/recordings/responses/cb1099daed49.json  Normal file
@@ -0,0 +1,55 @@
{
  "request": {
    "method": "POST",
    "url": "http://0.0.0.0:11434/v1/v1/chat/completions",
    "headers": {},
    "body": {
      "model": "llama3.2:3b-instruct-fp16",
      "messages": [
        {"role": "user", "content": "Test trace 1"}
      ]
    },
    "endpoint": "/v1/chat/completions",
    "model": "llama3.2:3b-instruct-fp16"
  },
  "response": {
    "body": {
      "__type__": "openai.types.chat.chat_completion.ChatCompletion",
      "__data__": {
        "id": "chatcmpl-122",
        "choices": [
          {
            "finish_reason": "stop",
            "index": 0,
            "logprobs": null,
            "message": {
              "content": "It appears you're trying to initiate a conversation or test the functionality of this AI system. I'm happy to chat with you!\n\nWould you like to:\nA) Ask me a question on a specific topic\nB) Engage in a conversational dialogue on a topic of your choice\nC) Play a text-based game\nD) Test my language understanding capabilities\n\nPlease respond with the letter of your preferred activity.",
              "refusal": null, "role": "assistant", "annotations": null,
              "audio": null, "function_call": null, "tool_calls": null
            }
          }
        ],
        "created": 1758978142,
        "model": "llama3.2:3b-instruct-fp16",
        "object": "chat.completion",
        "service_tier": null,
        "system_fingerprint": "fp_ollama",
        "usage": {"completion_tokens": 85, "prompt_tokens": 29, "total_tokens": 114, "completion_tokens_details": null, "prompt_tokens_details": null}
      }
    },
    "is_streaming": false
  }
}
@@ -11,26 +11,7 @@
     "body": {
       "__type__": "ollama._types.ProcessResponse",
       "__data__": {
-        "models": [
-          {
-            "model": "llama3.2-vision:11b",
-            "name": "llama3.2-vision:11b",
-            "digest": "6f2f9757ae97e8a3f8ea33d6adb2b11d93d9a35bef277cd2c0b1b5af8e8d0b1e",
-            "expires_at": "2025-09-03T11:51:35.966409-07:00",
-            "size": 12401209008,
-            "size_vram": 12401209008,
-            "details": {
-              "parent_model": "",
-              "format": "gguf",
-              "family": "mllama",
-              "families": [
-                "mllama"
-              ],
-              "parameter_size": "10.7B",
-              "quantization_level": "Q4_K_M"
-            }
-          }
-        ]
+        "models": []
       }
     },
     "is_streaming": false
@@ -31,14 +31,14 @@
     "body": {
       "__type__": "openai.types.chat.chat_completion.ChatCompletion",
       "__data__": {
-        "id": "chatcmpl-489",
+        "id": "chatcmpl-51",
         "choices": [
           {
             "finish_reason": "stop",
             "index": 0,
             "logprobs": null,
             "message": {
-              "content": "The image is a photograph of a young golden retriever puppy from the chest up. The puppy faces the camera and its tongue is out of its mouth, as if to say hello. It appears to be between 1 and 3 months of age. It is a fluffy little golden retriever puppy with very little fat. Its fur is light blond and very fluffy. It has a small, round black nose. It is in front of a blurry background of warm yellows and greys.",
+              "content": "The image features a close-up of a golden retriever puppy with its mouth agape. The puppy has cream-color fur with golden patches on its big ears, which are held slightly out to the sides. Its dark eyes appear black from across a great distance, while the black nose is surrounded by white fur. The puppy's mouth is wide open, revealing a healthy pink tongue and what appears to be a green leaf (likely a blade of grass) stuck in its mouth. The puppy is facing the camera directly, with its paws pressed up tight against its body. The puppy is sitting in an open field with a golden brown grass carpet. The puppy appears happy. The image is well-compressed with great digital sharpness.",
               "refusal": null,
               "role": "assistant",
               "annotations": null,
@@ -48,15 +48,15 @@
             }
           }
         ],
-        "created": 1758461767,
+        "created": 1756724768,
         "model": "llama3.2-vision:11b",
         "object": "chat.completion",
         "service_tier": null,
         "system_fingerprint": "fp_ollama",
         "usage": {
-          "completion_tokens": 100,
+          "completion_tokens": 147,
           "prompt_tokens": 18,
-          "total_tokens": 118,
+          "total_tokens": 165,
           "completion_tokens_details": null,
           "prompt_tokens_details": null
         }
@@ -32,8 +32,8 @@ def setup_telemetry_data(llama_stack_client, text_model_id):
     )

     for i in range(2):
-        llama_stack_client.inference.chat_completion(
-            model_id=text_model_id, messages=[{"role": "user", "content": f"Test trace {i}"}]
+        llama_stack_client.chat.completions.create(
+            model=text_model_id, messages=[{"role": "user", "content": f"Test trace {i}"}]
         )

     start_time = time.time()
@@ -83,12 +83,19 @@
       ],
       "tools": [
         {
-          "tool_name": "get_weather",
-          "description": "Get the current weather",
-          "parameters": {
-            "location": {
-              "param_type": "string",
-              "description": "The city and state (both required), e.g. San Francisco, CA."
+          "type": "function",
+          "function": {
+            "name": "get_weather",
+            "description": "Get the current weather",
+            "parameters": {
+              "type": "object",
+              "properties": {
+                "location": {
+                  "type": "string",
+                  "description": "The city and state (both required), e.g. San Francisco, CA."
+                }
+              },
+              "required": ["location"]
             }
           }
         }

@@ -116,12 +123,19 @@
       ],
       "tools": [
         {
-          "tool_name": "get_weather",
-          "description": "Get the current weather",
-          "parameters": {
-            "location": {
-              "param_type": "string",
-              "description": "The city and state (both required), e.g. San Francisco, CA."
+          "type": "function",
+          "function": {
+            "name": "get_weather",
+            "description": "Get the current weather",
+            "parameters": {
+              "type": "object",
+              "properties": {
+                "location": {
+                  "type": "string",
+                  "description": "The city and state (both required), e.g. San Francisco, CA."
+                }
+              },
+              "required": ["location"]
             }
           }
         }

@@ -162,12 +176,19 @@
       ],
       "tools": [
         {
-          "tool_name": "get_weather",
-          "description": "Get the current weather",
-          "parameters": {
-            "location": {
-              "param_type": "string",
-              "description": "The city and state (both required), e.g. San Francisco, CA."
+          "type": "function",
+          "function": {
+            "name": "get_weather",
+            "description": "Get the current weather",
+            "parameters": {
+              "type": "object",
+              "properties": {
+                "location": {
+                  "type": "string",
+                  "description": "The city and state (both required), e.g. San Francisco, CA."
+                }
+              },
+              "required": ["location"]
             }
           }
         }

@@ -192,66 +213,6 @@
         ]
       }
     },
-    "array_parameter": {
-      "data": {
-        "messages": [
-          [
-            {
-              "role": "user",
-              "content": "Please add a new product with name 'Widget', price 19.99, in stock, and tags ['new', 'sale'] and give me the product id."
-            }
-          ]
-        ],
-        "tools": [
-          {
-            "tool_name": "addProduct",
-            "description": "Get the current weather",
-            "parameters": {
-              "name": {
-                "param_type": "string",
-                "description": "Name of the product"
-              },
-              "price": {
-                "param_type": "number",
-                "description": "Price of the product"
-              },
-              "inStock": {
-                "param_type": "boolean",
-                "description": "Availability status of the product."
-              },
-              "tags": {
-                "param_type": "list[str]",
-                "description": "List of product tags"
-              }
-            }
-          }
-        ],
-        "tool_responses": [
-          {
-            "response": "{'response': 'Successfully added product with id: 123'}"
-          }
-        ],
-        "expected": [
-          {
-            "num_tool_calls": 1,
-            "tool_name": "addProduct",
-            "tool_arguments": {
-              "name": "Widget",
-              "price": 19.99,
-              "inStock": true,
-              "tags": [
-                "new",
-                "sale"
-              ]
-            }
-          },
-          {
-            "num_tool_calls": 0,
-            "answer": "123"
-          }
-        ]
-      }
-    },
     "sample_messages_tool_calling": {
       "data": {
         "messages": [

@@ -270,13 +231,19 @@
       ],
       "tools": [
         {
-          "tool_name": "get_weather",
-          "description": "Get the current weather",
-          "parameters": {
-            "location": {
-              "param_type": "string",
-              "description": "The city and state, e.g. San Francisco, CA",
-              "required": true
+          "type": "function",
+          "function": {
+            "name": "get_weather",
+            "description": "Get the current weather",
+            "parameters": {
+              "type": "object",
+              "properties": {
+                "location": {
+                  "type": "string",
+                  "description": "The city and state (both required), e.g. San Francisco, CA."
+                }
+              },
+              "required": ["location"]
             }
           }
         }

@@ -343,18 +310,23 @@
       ],
       "tools": [
         {
-          "tool_name": "get_object_namespace_list",
-          "description": "Get the list of objects in a namespace",
-          "parameters": {
-            "kind": {
-              "param_type": "string",
-              "description": "the type of object",
-              "required": true
-            },
-            "namespace": {
-              "param_type": "string",
-              "description": "the name of the namespace",
-              "required": true
+          "type": "function",
+          "function": {
+            "name": "get_object_namespace_list",
+            "description": "Get the list of objects in a namespace",
+            "parameters": {
+              "type": "object",
+              "properties": {
+                "kind": {
+                  "type": "string",
+                  "description": "the type of object"
+                },
+                "namespace": {
+                  "type": "string",
+                  "description": "the name of the namespace"
+                }
+              },
+              "required": ["kind", "namespace"]
             }
           }
         }
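The last set of hunks captures the second half of the migration: tool definitions move from the older flat form (`tool_name`, `param_type`, per-parameter `required`) to the OpenAI function-tool layout, where `parameters` is a JSON Schema object with `properties` and a `required` list. A minimal sketch of a call using the new shape; the client construction and model id below are placeholders rather than values from the test data.

from llama_stack_client import LlamaStackClient

client = LlamaStackClient(base_url="http://localhost:8321")  # placeholder endpoint

get_weather_tool = {
    "type": "function",
    "function": {
        "name": "get_weather",
        "description": "Get the current weather",
        "parameters": {
            "type": "object",
            "properties": {
                "location": {
                    "type": "string",
                    "description": "The city and state (both required), e.g. San Francisco, CA.",
                }
            },
            "required": ["location"],
        },
    },
}

response = client.chat.completions.create(
    model="<text-model-id>",
    messages=[{"role": "user", "content": "What's the weather like in San Francisco, CA?"}],
    tools=[get_weather_tool],
    tool_choice="auto",
)

# When the model decides to call the tool, the call arrives on the message,
# with the arguments serialized as a JSON string.
for call in response.choices[0].message.tool_calls or []:
    print(call.function.name, call.function.arguments)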