refactor: tests/unittests -> tests/unit; tests/api -> tests/integration

2025-03-04 09:55:05 -08:00 · 2025-03-04 09:55:05 -08:00 · 4ca58eb987
commit 4ca58eb987
parent c6b13b6a24
33 changed files with 0 additions and 0 deletions
--- a/tests/integration/inference/__init__.py
+++ b/tests/integration/inference/__init__.py
@@ -0,0 +1,6 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+# ruff: noqa: N999
--- a/tests/integration/inference/dog.png
+++ b/tests/integration/inference/dog.png
--- a/tests/integration/inference/test_embedding.py
+++ b/tests/integration/inference/test_embedding.py
@@ -0,0 +1,292 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+
+#
+# Test plan:
+#
+#  Types of input:
+#   - array of a string
+#   - array of an image (ImageContentItem, either URL or base64 string)
+#   - array of a text (TextContentItem)
+#  Types of output:
+#   - list of list of floats
+#  Params:
+#   - text_truncation
+#     - absent w/ long text -> error
+#     - none w/ long text -> error
+#     - absent w/ short text -> ok
+#     - none w/ short text -> ok
+#     - end w/ long text -> ok
+#     - end w/ short text -> ok
+#     - start w/ long text -> ok
+#     - start w/ short text -> ok
+#   - output_dimension
+#     - response dimension matches
+#   - task_type, only for asymmetric models
+#     - query embedding != passage embedding
+#  Negative:
+#   - long string
+#   - long text
+#
+# Todo:
+#  - negative tests
+#    - empty
+#      - empty list
+#      - empty string
+#      - empty text
+#      - empty image
+#    - long
+#      - large image
+#      - appropriate combinations
+#    - batch size
+#      - many inputs
+#    - invalid
+#      - invalid URL
+#      - invalid base64
+#
+# Notes:
+#  - use llama_stack_client fixture
+#  - use pytest.mark.parametrize when possible
+#  - no accuracy tests: only check the type of output, not the content
+#
+
+import pytest
+from llama_stack_client import BadRequestError
+from llama_stack_client.types import EmbeddingsResponse
+from llama_stack_client.types.shared.interleaved_content import (
+    ImageContentItem,
+    ImageContentItemImage,
+    ImageContentItemImageURL,
+    TextContentItem,
+)
+
+# Short inputs used by the basic embedding tests.
+DUMMY_STRING = "hello"
+DUMMY_STRING2 = "world"
+# Long input ("NVDA " repeated 10240 times) used by the truncation tests.
+DUMMY_LONG_STRING = "NVDA " * 10240
+# The same inputs wrapped as TextContentItem objects.
+DUMMY_TEXT = TextContentItem(text=DUMMY_STRING, type="text")
+DUMMY_TEXT2 = TextContentItem(text=DUMMY_STRING2, type="text")
+DUMMY_LONG_TEXT = TextContentItem(text=DUMMY_LONG_STRING, type="text")
+# TODO(mf): add a real image URL and base64 string
+DUMMY_IMAGE_URL = ImageContentItem(
+    image=ImageContentItemImage(url=ImageContentItemImageURL(uri="https://example.com/image.jpg")), type="image"
+)
+DUMMY_IMAGE_BASE64 = ImageContentItem(image=ImageContentItemImage(data="base64string"), type="image")
+# Provider types the embedding tests run against; any other provider is xfailed.
+SUPPORTED_PROVIDERS = {"remote::nvidia"}
+# Models accepting image input. Currently empty, so every image test is xfailed.
+# Written as set() because a bare {} literal is an empty dict, not an empty set.
+MODELS_SUPPORTING_MEDIA = set()
+# Models for which the output_dimension test is run.
+MODELS_SUPPORTING_OUTPUT_DIMENSION = {"nvidia/llama-3.2-nv-embedqa-1b-v2"}
+# Models that get a default task_type added to every request (see default_task_type).
+MODELS_REQUIRING_TASK_TYPE = {
+    "nvidia/llama-3.2-nv-embedqa-1b-v2",
+    "nvidia/nv-embedqa-e5-v5",
+    "nvidia/nv-embedqa-mistral-7b-v2",
+    "snowflake/arctic-embed-l",
+}
+# Same set object as above: the models that require task_type are the ones tested for it.
+MODELS_SUPPORTING_TASK_TYPE = MODELS_REQUIRING_TASK_TYPE
+
+
+def default_task_type(model_id):
+    """
+    Build the extra keyword arguments an embeddings request needs for `model_id`.
+
+    Returns {"task_type": "query"} when the model is listed in
+    MODELS_REQUIRING_TASK_TYPE and an empty dict otherwise, so callers can
+    always splat the result into the request with **.
+    """
+    return {"task_type": "query"} if model_id in MODELS_REQUIRING_TASK_TYPE else {}
+
+
+@pytest.mark.parametrize(
+    "contents",
+    [[DUMMY_STRING, DUMMY_STRING2], [DUMMY_TEXT, DUMMY_TEXT2]],
+    ids=["list[string]", "list[text]"],
+)
+def test_embedding_text(llama_stack_client, embedding_model_id, contents, inference_provider_type):
+    """Embed plain strings and TextContentItems; check the count and type of the result."""
+    if inference_provider_type not in SUPPORTED_PROVIDERS:
+        pytest.xfail(f"{inference_provider_type} doesn't support embedding model yet")
+
+    extra_params = default_task_type(embedding_model_id)
+    result = llama_stack_client.inference.embeddings(
+        model_id=embedding_model_id, contents=contents, **extra_params
+    )
+    assert isinstance(result, EmbeddingsResponse)
+
+    # A nested list counts once per element; any other item counts once.
+    expected_count = sum(len(item) if isinstance(item, list) else 1 for item in contents)
+    vectors = result.embeddings
+    assert len(vectors) == expected_count
+
+    # Only the type is checked, never the values (no accuracy tests).
+    first_vector = vectors[0]
+    assert isinstance(first_vector, list)
+    assert isinstance(first_vector[0], float)
+
+
+@pytest.mark.parametrize(
+    "contents",
+    [
+        [DUMMY_IMAGE_URL, DUMMY_IMAGE_BASE64],
+        [DUMMY_IMAGE_URL, DUMMY_STRING, DUMMY_IMAGE_BASE64, DUMMY_TEXT],
+    ],
+    ids=["list[url,base64]", "list[url,string,base64,text]"],
+)
+def test_embedding_image(llama_stack_client, embedding_model_id, contents, inference_provider_type):
+    """Embed image-only and mixed image/text inputs; xfail for models without media support."""
+    if inference_provider_type not in SUPPORTED_PROVIDERS:
+        pytest.xfail(f"{inference_provider_type} doesn't support embedding model yet")
+    if embedding_model_id not in MODELS_SUPPORTING_MEDIA:
+        pytest.xfail(f"{embedding_model_id} doesn't support media")
+
+    extra_params = default_task_type(embedding_model_id)
+    result = llama_stack_client.inference.embeddings(
+        model_id=embedding_model_id, contents=contents, **extra_params
+    )
+    assert isinstance(result, EmbeddingsResponse)
+
+    # A nested list counts once per element; any other item counts once.
+    expected_count = sum(len(item) if isinstance(item, list) else 1 for item in contents)
+    vectors = result.embeddings
+    assert len(vectors) == expected_count
+
+    first_vector = vectors[0]
+    assert isinstance(first_vector, list)
+    assert isinstance(first_vector[0], float)
+
+
+@pytest.mark.parametrize("text_truncation", ["end", "start"])
+@pytest.mark.parametrize(
+    "contents",
+    [[DUMMY_LONG_TEXT], [DUMMY_STRING]],
+    ids=["long", "short"],
+)
+def test_embedding_truncation(
+    llama_stack_client, embedding_model_id, text_truncation, contents, inference_provider_type
+):
+    """With "end" or "start" truncation, both long and short inputs return one embedding."""
+    if inference_provider_type not in SUPPORTED_PROVIDERS:
+        pytest.xfail(f"{inference_provider_type} doesn't support embedding model yet")
+
+    extra_params = default_task_type(embedding_model_id)
+    result = llama_stack_client.inference.embeddings(
+        model_id=embedding_model_id,
+        contents=contents,
+        text_truncation=text_truncation,
+        **extra_params,
+    )
+    assert isinstance(result, EmbeddingsResponse)
+
+    vectors = result.embeddings
+    assert len(vectors) == 1
+    only_vector = vectors[0]
+    assert isinstance(only_vector, list)
+    assert isinstance(only_vector[0], float)
+
+
+@pytest.mark.parametrize(
+    "text_truncation",
+    [
+        None,
+        "none",
+    ],
+)
+@pytest.mark.parametrize(
+    "contents",
+    [
+        [DUMMY_LONG_TEXT],
+        [DUMMY_LONG_STRING],
+    ],
+    ids=[
+        "long-text",
+        "long-str",
+    ],
+)
+def test_embedding_truncation_error(
+    llama_stack_client, embedding_model_id, text_truncation, contents, inference_provider_type
+):
+    """
+    Long input with truncation absent (None) or disabled ("none") must be rejected.
+
+    Runs for both a long TextContentItem and a long plain string; each
+    combination is expected to raise BadRequestError.
+    """
+    if inference_provider_type not in SUPPORTED_PROVIDERS:
+        pytest.xfail(f"{inference_provider_type} doesn't support embedding model yet")
+    with pytest.raises(BadRequestError):
+        llama_stack_client.inference.embeddings(
+            model_id=embedding_model_id,
+            # Send the parametrized input. Hard-coding [DUMMY_LONG_TEXT] here would make
+            # the "long-str" case silently repeat the "long-text" case.
+            contents=contents,
+            text_truncation=text_truncation,
+            **default_task_type(embedding_model_id),
+        )
+
+
+def test_embedding_output_dimension(llama_stack_client, embedding_model_id, inference_provider_type):
+    """Requesting output_dimension=32 yields a 32-element vector that differs in length from the default."""
+    if inference_provider_type not in SUPPORTED_PROVIDERS:
+        pytest.xfail(f"{inference_provider_type} doesn't support embedding model yet")
+    if embedding_model_id not in MODELS_SUPPORTING_OUTPUT_DIMENSION:
+        pytest.xfail(f"{embedding_model_id} doesn't support output_dimension")
+
+    embed = llama_stack_client.inference.embeddings
+
+    # Baseline request without output_dimension.
+    default_result = embed(
+        model_id=embedding_model_id, contents=[DUMMY_STRING], **default_task_type(embedding_model_id)
+    )
+    # Same request with an explicit dimension.
+    sized_result = embed(
+        model_id=embedding_model_id,
+        contents=[DUMMY_STRING],
+        **default_task_type(embedding_model_id),
+        output_dimension=32,
+    )
+
+    default_length = len(default_result.embeddings[0])
+    sized_length = len(sized_result.embeddings[0])
+    assert default_length != sized_length
+    assert len(sized_result.embeddings[0]) == 32
+
+
+def test_embedding_task_type(llama_stack_client, embedding_model_id, inference_provider_type):
+    """The same text embedded as "query" and as "document" must give different embeddings."""
+    if inference_provider_type not in SUPPORTED_PROVIDERS:
+        pytest.xfail(f"{inference_provider_type} doesn't support embedding model yet")
+    if embedding_model_id not in MODELS_SUPPORTING_TASK_TYPE:
+        pytest.xfail(f"{embedding_model_id} doesn't support task_type")
+
+    as_query = llama_stack_client.inference.embeddings(
+        model_id=embedding_model_id, contents=[DUMMY_STRING], task_type="query"
+    )
+    as_document = llama_stack_client.inference.embeddings(
+        model_id=embedding_model_id, contents=[DUMMY_STRING], task_type="document"
+    )
+
+    assert as_query.embeddings != as_document.embeddings
+
+
+@pytest.mark.parametrize("text_truncation", [None, "none", "end", "start"])
+def test_embedding_text_truncation(llama_stack_client, embedding_model_id, text_truncation, inference_provider_type):
+    """Each text_truncation value listed above is accepted for a short input."""
+    if inference_provider_type not in SUPPORTED_PROVIDERS:
+        pytest.xfail(f"{inference_provider_type} doesn't support embedding model yet")
+
+    extra_params = default_task_type(embedding_model_id)
+    result = llama_stack_client.inference.embeddings(
+        model_id=embedding_model_id,
+        contents=[DUMMY_STRING],
+        text_truncation=text_truncation,
+        **extra_params,
+    )
+    assert isinstance(result, EmbeddingsResponse)
+
+    vectors = result.embeddings
+    assert len(vectors) == 1
+    only_vector = vectors[0]
+    assert isinstance(only_vector, list)
+    assert isinstance(only_vector[0], float)
+
+
+@pytest.mark.parametrize("text_truncation", ["NONE", "END", "START", "left", "right"])
+def test_embedding_text_truncation_error(
+    llama_stack_client, embedding_model_id, text_truncation, inference_provider_type
+):
+    """Upper-case spellings and unknown text_truncation values are expected to raise BadRequestError."""
+    if inference_provider_type not in SUPPORTED_PROVIDERS:
+        pytest.xfail(f"{inference_provider_type} doesn't support embedding model yet")
+
+    request_kwargs = dict(
+        model_id=embedding_model_id,
+        contents=[DUMMY_STRING],
+        text_truncation=text_truncation,
+        **default_task_type(embedding_model_id),
+    )
+    with pytest.raises(BadRequestError):
+        llama_stack_client.inference.embeddings(**request_kwargs)
--- a/tests/integration/inference/test_text_inference.py
+++ b/tests/integration/inference/test_text_inference.py
@@ -0,0 +1,412 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+
+import pytest
+from pydantic import BaseModel
+
+from llama_stack.models.llama.sku_list import resolve_model
+from llama_stack.providers.tests.test_cases.test_case import TestCase
+
+# Provider types the log-prob tests run against; any other provider type is xfailed.
+PROVIDER_LOGPROBS_TOP_K = {"remote::together", "remote::fireworks", "remote::vllm"}
+
+
+def skip_if_model_doesnt_support_completion(client_with_models, model_id):
+    """
+    Skip the calling test when `model_id` is served by a provider without completion support.
+
+    Looks up the model's provider through the client and calls pytest.skip
+    when the provider type is one of the listed remote providers. Raises
+    KeyError if the model or its provider is not registered.
+    """
+    models_by_id = {model.identifier: model for model in client_with_models.models.list()}
+    owning_provider_id = models_by_id[model_id].provider_id
+
+    providers_by_id = {prov.provider_id: prov for prov in client_with_models.providers.list()}
+    provider_type = providers_by_id[owning_provider_id].provider_type
+
+    if provider_type in ("remote::openai", "remote::anthropic", "remote::gemini", "remote::groq"):
+        pytest.skip(f"Model {model_id} hosted by {provider_type} doesn't support completion")
+
+
+def get_llama_model(client_with_models, model_id):
+    """
+    Map `model_id` to a name that resolve_model() accepts.
+
+    The model may be referenced by its identifier or by its
+    provider_resource_id. The first of those two that resolve_model()
+    accepts is returned; failing that, the model's "llama_model" metadata
+    entry (or None) is returned. Asserts that `model_id` is registered.
+    """
+    registry = {
+        key: entry
+        for entry in client_with_models.models.list()
+        for key in (entry.identifier, entry.provider_resource_id)
+    }
+    assert model_id in registry, f"Model {model_id} not found"
+
+    selected = registry[model_id]
+    for candidate in (selected.identifier, selected.provider_resource_id):
+        if resolve_model(candidate):
+            return candidate
+
+    return selected.metadata.get("llama_model")
+
+
+@pytest.mark.parametrize("test_case", ["inference:completion:sanity"])
+def test_text_completion_non_streaming(client_with_models, text_model_id, test_case):
+    """A non-streaming completion for the sanity prompt returns more than 10 characters."""
+    skip_if_model_doesnt_support_completion(client_with_models, text_model_id)
+    case = TestCase(test_case)
+
+    sampling = {"max_tokens": 50}
+    completion = client_with_models.inference.completion(
+        content=case["content"],
+        stream=False,
+        model_id=text_model_id,
+        sampling_params=sampling,
+    )
+    assert len(completion.content) > 10
+    # assert "blue" in response.content.lower().strip()
+
+
+@pytest.mark.parametrize("test_case", ["inference:completion:sanity"])
+def test_text_completion_streaming(client_with_models, text_model_id, test_case):
+    """A streamed completion for the sanity prompt adds up to more than 10 characters."""
+    skip_if_model_doesnt_support_completion(client_with_models, text_model_id)
+    case = TestCase(test_case)
+
+    sampling = {"max_tokens": 50}
+    stream = client_with_models.inference.completion(
+        content=case["content"],
+        stream=True,
+        model_id=text_model_id,
+        sampling_params=sampling,
+    )
+    # Concatenate every delta, then normalise case and surrounding whitespace.
+    full_text = "".join([chunk.delta for chunk in stream])
+    content_str = full_text.lower().strip()
+    # assert "blue" in content_str
+    assert len(content_str) > 10
+
+
+@pytest.mark.parametrize("test_case", ["inference:completion:log_probs"])
+def test_text_completion_log_probs_non_streaming(client_with_models, text_model_id, inference_provider_type, test_case):
+    """With top_k=1 and max_tokens=5, expect 1 to 5 logprob entries holding one token each."""
+    skip_if_model_doesnt_support_completion(client_with_models, text_model_id)
+    if inference_provider_type not in PROVIDER_LOGPROBS_TOP_K:
+        pytest.xfail(f"{inference_provider_type} doesn't support log probs yet")
+
+    case = TestCase(test_case)
+
+    completion = client_with_models.inference.completion(
+        content=case["content"],
+        stream=False,
+        model_id=text_model_id,
+        sampling_params={"max_tokens": 5},
+        logprobs={"top_k": 1},
+    )
+    assert completion.logprobs, "Logprobs should not be empty"
+    # each token has 1 logprob and here max_tokens=5
+    assert 1 <= len(completion.logprobs) <= 5
+    assert all(len(entry.logprobs_by_token) == 1 for entry in completion.logprobs)
+
+
+@pytest.mark.parametrize("test_case", ["inference:completion:log_probs"])
+def test_text_completion_log_probs_streaming(client_with_models, text_model_id, inference_provider_type, test_case):
+    """Streamed chunks carry logprobs exactly when they carry a token."""
+    skip_if_model_doesnt_support_completion(client_with_models, text_model_id)
+    if inference_provider_type not in PROVIDER_LOGPROBS_TOP_K:
+        pytest.xfail(f"{inference_provider_type} doesn't support log probs yet")
+
+    case = TestCase(test_case)
+
+    stream = client_with_models.inference.completion(
+        content=case["content"],
+        stream=True,
+        model_id=text_model_id,
+        sampling_params={"max_tokens": 5},
+        logprobs={"top_k": 1},
+    )
+    # Drain the whole stream before checking any chunk.
+    chunks = [*stream]
+    for chunk in chunks:
+        if not chunk.delta:
+            # no token, no logprobs
+            assert not chunk.logprobs, "Logprobs should be empty"
+            continue
+        # if there's a token, we expect logprobs
+        assert chunk.logprobs, "Logprobs should not be empty"
+        assert all(len(entry.logprobs_by_token) == 1 for entry in chunk.logprobs)
+
+
+@pytest.mark.parametrize("test_case", ["inference:completion:structured_output"])
+def test_text_completion_structured_output(client_with_models, text_model_id, test_case):
+    """A completion constrained by a JSON schema parses into AnswerFormat with the expected values."""
+    skip_if_model_doesnt_support_completion(client_with_models, text_model_id)
+
+    # The class and field names feed the JSON schema sent to the model.
+    class AnswerFormat(BaseModel):
+        name: str
+        year_born: str
+        year_retired: str
+
+    case = TestCase(test_case)
+
+    prompt = case["user_input"]
+    schema_format = {
+        "type": "json_schema",
+        "json_schema": AnswerFormat.model_json_schema(),
+    }
+    completion = client_with_models.inference.completion(
+        model_id=text_model_id,
+        content=prompt,
+        stream=False,
+        sampling_params={"max_tokens": 50},
+        response_format=schema_format,
+    )
+
+    parsed = AnswerFormat.model_validate_json(completion.content)
+    expected = case["expected"]
+    for field_name in ("name", "year_born", "year_retired"):
+        assert getattr(parsed, field_name) == expected[field_name]
+
+
+@pytest.mark.parametrize(
+    "test_case",
+    ["inference:chat_completion:non_streaming_01", "inference:chat_completion:non_streaming_02"],
+)
+def test_text_chat_completion_non_streaming(client_with_models, text_model_id, test_case):
+    """The non-streaming chat answer contains the expected text, ignoring case."""
+    case = TestCase(test_case)
+    question, expected = case["question"], case["expected"]
+
+    user_message = {"role": "user", "content": question}
+    reply = client_with_models.inference.chat_completion(
+        model_id=text_model_id,
+        messages=[user_message],
+        stream=False,
+    )
+
+    answer_text = reply.completion_message.content.lower().strip()
+    assert len(answer_text) > 0
+    assert expected.lower() in answer_text
+
+
+@pytest.mark.parametrize(
+    "test_case",
+    ["inference:chat_completion:streaming_01", "inference:chat_completion:streaming_02"],
+)
+def test_text_chat_completion_streaming(client_with_models, text_model_id, test_case):
+    """The streamed chat answer, joined from stripped lower-cased deltas, contains the expected text."""
+    case = TestCase(test_case)
+    question, expected = case["question"], case["expected"]
+
+    stream = client_with_models.inference.chat_completion(
+        model_id=text_model_id,
+        messages=[{"role": "user", "content": question}],
+        stream=True,
+    )
+
+    # Each delta is lower-cased and stripped on its own before joining.
+    pieces = []
+    for chunk in stream:
+        pieces.append(chunk.event.delta.text.lower().strip())
+    assert len(pieces) > 0
+    assert expected.lower() in "".join(pieces)
+
+
+@pytest.mark.parametrize("test_case", ["inference:chat_completion:tool_calling"])
+def test_text_chat_completion_with_tool_calling_and_non_streaming(client_with_models, text_model_id, test_case):
+    """With tool_choice="auto", the reply holds exactly one call to the first tool with the expected arguments."""
+    case = TestCase(test_case)
+
+    reply = client_with_models.inference.chat_completion(
+        model_id=text_model_id,
+        messages=case["messages"],
+        tools=case["tools"],
+        tool_choice="auto",
+        stream=False,
+    )
+    message = reply.completion_message
+    # some models can return content for the response in addition to the tool call
+    assert message.role == "assistant"
+
+    assert len(message.tool_calls) == 1
+    only_call = message.tool_calls[0]
+    assert only_call.tool_name == case["tools"][0]["tool_name"]
+    assert only_call.arguments == case["expected"]
+
+
+# Extracts the tool invocation content from a streamed response, ignoring plain text deltas.
+# The returned tool invocation content is a string so it is easy to compare with the expected value,
+# e.g. "[get_weather, {'location': 'San Francisco, CA'}]"
+def extract_tool_invocation_content(response):
+    """
+    Render every successfully parsed tool call in a streamed response as one string.
+
+    Each matching chunk contributes "[<tool_name>, <arguments>]"; the pieces
+    are concatenated in stream order. Returns "" when no chunk matches.
+    """
+    rendered_calls = []
+    for chunk in response:
+        delta = chunk.event.delta
+        # Keep only tool-call deltas whose parse succeeded.
+        if delta.type != "tool_call" or delta.parse_status != "succeeded":
+            continue
+        invocation = delta.tool_call
+        rendered_calls.append(f"[{invocation.tool_name}, {invocation.arguments}]")
+    return "".join(rendered_calls)
+
+
+@pytest.mark.parametrize("test_case", ["inference:chat_completion:tool_calling"])
+def test_text_chat_completion_with_tool_calling_and_streaming(client_with_models, text_model_id, test_case):
+    """Streaming with tool_choice="auto" produces exactly the expected call to the first tool."""
+    case = TestCase(test_case)
+
+    stream = client_with_models.inference.chat_completion(
+        model_id=text_model_id,
+        messages=case["messages"],
+        tools=case["tools"],
+        tool_choice="auto",
+        stream=True,
+    )
+    rendered = extract_tool_invocation_content(stream)
+
+    tool_name = case["tools"][0]["tool_name"]
+    arguments = case["expected"]
+    assert rendered == f"[{tool_name}, {arguments}]"
+
+
+@pytest.mark.parametrize("test_case", ["inference:chat_completion:tool_calling"])
+def test_text_chat_completion_with_tool_choice_required(client_with_models, text_model_id, test_case):
+    """Streaming with tool_config tool_choice="required" produces exactly the expected tool call."""
+    case = TestCase(test_case)
+
+    stream = client_with_models.inference.chat_completion(
+        model_id=text_model_id,
+        messages=case["messages"],
+        tools=case["tools"],
+        tool_config={"tool_choice": "required"},
+        stream=True,
+    )
+    rendered = extract_tool_invocation_content(stream)
+
+    tool_name = case["tools"][0]["tool_name"]
+    arguments = case["expected"]
+    assert rendered == f"[{tool_name}, {arguments}]"
+
+
+@pytest.mark.parametrize("test_case", ["inference:chat_completion:tool_calling"])
+def test_text_chat_completion_with_tool_choice_none(client_with_models, text_model_id, test_case):
+    """Streaming with tool_config tool_choice="none" produces no successfully parsed tool call."""
+    case = TestCase(test_case)
+
+    no_tools_config = {"tool_choice": "none"}
+    stream = client_with_models.inference.chat_completion(
+        model_id=text_model_id,
+        messages=case["messages"],
+        tools=case["tools"],
+        tool_config=no_tools_config,
+        stream=True,
+    )
+    rendered = extract_tool_invocation_content(stream)
+    assert rendered == ""
+
+
+@pytest.mark.parametrize("test_case", ["inference:chat_completion:structured_output"])
+def test_text_chat_completion_structured_output(client_with_models, text_model_id, test_case):
+    """A chat reply constrained by a nested JSON schema parses into AnswerFormat with the expected values."""
+
+    # The class and field names below feed the JSON schema sent to the model.
+    class NBAStats(BaseModel):
+        year_for_draft: int
+        num_seasons_in_nba: int
+
+    class AnswerFormat(BaseModel):
+        first_name: str
+        last_name: str
+        year_of_birth: int
+        nba_stats: NBAStats
+
+    case = TestCase(test_case)
+
+    schema_format = {
+        "type": "json_schema",
+        "json_schema": AnswerFormat.model_json_schema(),
+    }
+    reply = client_with_models.inference.chat_completion(
+        model_id=text_model_id,
+        messages=case["messages"],
+        response_format=schema_format,
+        stream=False,
+    )
+
+    parsed = AnswerFormat.model_validate_json(reply.completion_message.content)
+    expected = case["expected"]
+
+    # Top-level fields first, then the nested stats (expected values are stored flat).
+    for field_name in ("first_name", "last_name", "year_of_birth"):
+        assert getattr(parsed, field_name) == expected[field_name]
+    stats = parsed.nba_stats
+    assert stats.num_seasons_in_nba == expected["num_seasons_in_nba"]
+    assert stats.year_for_draft == expected["year_for_draft"]
+
+
+@pytest.mark.parametrize("streaming", [True, False])
+@pytest.mark.parametrize("test_case", ["inference:chat_completion:tool_calling_tools_absent"])
+def test_text_chat_completion_tool_calling_tools_not_in_request(
+    client_with_models, text_model_id, test_case, streaming
+):
+    """
+    Any tool call produced for this case must target get_object_namespace_list.
+
+    In streaming mode a tool call that failed to parse is also accepted, as
+    long as it carries the raw, non-empty message text.
+    """
+    case = TestCase(test_case)
+
+    # TODO: more dynamic lookup on tool_prompt_format for model family
+    if "3.1" in text_model_id:
+        tool_prompt_format = "json"
+    else:
+        tool_prompt_format = "python_list"
+
+    reply = client_with_models.inference.chat_completion(
+        model_id=text_model_id,
+        messages=case["messages"],
+        tools=case["tools"],
+        tool_choice="auto",
+        tool_prompt_format=tool_prompt_format,
+        stream=streaming,
+    )
+
+    if not streaming:
+        for tool_call in reply.completion_message.tool_calls:
+            assert tool_call.tool_name == "get_object_namespace_list"
+        return
+
+    for chunk in reply:
+        delta = chunk.event.delta
+        if delta.type != "tool_call":
+            continue
+        if delta.parse_status == "succeeded":
+            assert delta.tool_call.tool_name == "get_object_namespace_list"
+        elif delta.parse_status == "failed":
+            # expect raw message that failed to parse in tool_call
+            assert isinstance(delta.tool_call, str)
+            assert len(delta.tool_call) > 0
--- a/tests/integration/inference/test_vision_inference.py
+++ b/tests/integration/inference/test_vision_inference.py
@@ -0,0 +1,123 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+import base64
+import pathlib
+
+import pytest
+
+
+@pytest.fixture
+def image_path():
+    """Path to the dog.png sample image located next to this test module."""
+    test_dir = pathlib.Path(__file__).parent
+    return test_dir / "dog.png"
+
+
+@pytest.fixture
+def base64_image_data(image_path):
+    """The sample image's bytes, base64-encoded and decoded to a UTF-8 str."""
+    raw_bytes = image_path.read_bytes()
+    encoded = base64.b64encode(raw_bytes)
+    return encoded.decode("utf-8")
+
+
+@pytest.fixture
+def base64_image_url(base64_image_data, image_path):
+    """A data: URL embedding the sample image, with the subtype taken from the file extension."""
+    # Path.suffix keeps the leading dot (".png"), so drop the first character.
+    extension = image_path.suffix[1:]
+    return f"data:image/{extension};base64,{base64_image_data}"
+
+
+def test_image_chat_completion_non_streaming(client_with_models, vision_model_id):
+    """Ask the vision model to describe a remote dog image; the answer must mention a dog."""
+    message = {
+        "role": "user",
+        "content": [
+            {
+                "type": "image",
+                "image": {
+                    "url": {
+                        # Points at the image's location after the tests/api -> tests/integration move.
+                        "uri": "https://raw.githubusercontent.com/meta-llama/llama-stack/main/tests/integration/inference/dog.png"
+                    },
+                },
+            },
+            {
+                "type": "text",
+                "text": "Describe what is in this image.",
+            },
+        ],
+    }
+    response = client_with_models.inference.chat_completion(
+        model_id=vision_model_id,
+        messages=[message],
+        stream=False,
+    )
+    message_content = response.completion_message.content.lower().strip()
+    assert len(message_content) > 0
+    # Any of these words counts as recognising the dog.
+    assert any(expected in message_content for expected in {"dog", "puppy", "pup"})
+
+
+def test_image_chat_completion_streaming(client_with_models, vision_model_id):
+    """Stream a description of a remote dog image; the joined text must mention a dog."""
+    message = {
+        "role": "user",
+        "content": [
+            {
+                "type": "image",
+                "image": {
+                    "url": {
+                        # Points at the image's location after the tests/api -> tests/integration move.
+                        "uri": "https://raw.githubusercontent.com/meta-llama/llama-stack/main/tests/integration/inference/dog.png"
+                    },
+                },
+            },
+            {
+                "type": "text",
+                "text": "Describe what is in this image.",
+            },
+        ],
+    }
+    response = client_with_models.inference.chat_completion(
+        model_id=vision_model_id,
+        messages=[message],
+        stream=True,
+    )
+    # Join the lower-cased deltas in one pass instead of repeated string +=.
+    streamed_content = "".join(chunk.event.delta.text.lower() for chunk in response)
+    assert len(streamed_content) > 0
+    # Any of these words counts as recognising the dog.
+    assert any(expected in streamed_content for expected in {"dog", "puppy", "pup"})
+
+
+@pytest.mark.parametrize("type_", ["url", "data"])
+def test_image_chat_completion_base64(client_with_models, vision_model_id, base64_image_data, base64_image_url, type_):
+    """Send the local image as a data: URL or as raw base64 data and expect a non-empty reply."""
+    as_url = {"type": "image", "image": {"url": {"uri": base64_image_url}}}
+    as_data = {"type": "image", "image": {"data": base64_image_data}}
+    # Pick the encoding under test.
+    image_spec = {"url": as_url, "data": as_data}[type_]
+
+    prompt = {"type": "text", "text": "Describe what is in this image."}
+    message = {"role": "user", "content": [image_spec, prompt]}
+
+    reply = client_with_models.inference.chat_completion(
+        model_id=vision_model_id,
+        messages=[message],
+        stream=False,
+    )
+    reply_text = reply.completion_message.content.lower().strip()
+    assert len(reply_text) > 0