Update the "InterleavedTextMedia" type (#635)

## What does this PR do?

This is a long-pending change and particularly important to get done now.

Specifically:

- We cannot "localize" (aka download) any URLs from media attachments anywhere near our modeling code. It must be done within llama-stack.
- `PIL.Image` is infesting all our APIs via `ImageMedia -> InterleavedTextMedia` and that cannot be right at all. Anything in the API surface must be "naturally serializable". We need a standard `{ type: "image", image_url: "<...>" }` which is more extensible.
- `UserMessage`, `SystemMessage`, etc. are moved completely to llama-stack from the llama-models repository.

See https://github.com/meta-llama/llama-models/pull/244 for the corresponding PR in llama-models.

## Test Plan

```bash
cd llama_stack/providers/tests

pytest -s -v -k "fireworks or ollama or together" inference/test_vision_inference.py
pytest -s -v -k "(fireworks or ollama or together) and llama_3b" inference/test_text_inference.py
pytest -s -v -k chroma memory/test_memory.py \
  --env EMBEDDING_DIMENSION=384 --env CHROMA_DB_PATH=/tmp/foobar

pytest -s -v -k fireworks agents/test_agents.py \
  --safety-shield=meta-llama/Llama-Guard-3-8B \
  --inference-model=meta-llama/Llama-3.1-8B-Instruct
```

Updated the client SDK (see PR ...), installed the SDK in the same environment and then ran the SDK tests:

```bash
cd tests/client-sdk
LLAMA_STACK_CONFIG=together pytest -s -v agents/test_agents.py
LLAMA_STACK_CONFIG=ollama pytest -s -v memory/test_memory.py

# this one needed a bit of hacking in the run.yaml to ensure I could register the vision model correctly
INFERENCE_MODEL=llama3.2-vision:latest LLAMA_STACK_CONFIG=ollama pytest -s -v inference/test_inference.py
```
2025-12-03 09:53:45 +00:00 · 2024-12-17 11:18:31 -08:00 · 2024-12-17 11:18:31 -08:00 · 8de8eb03c8
commit 8de8eb03c8
parent 10eb31badf
66 changed files with 1344 additions and 1801 deletions
--- a/llama_stack/apis/common/content_types.py
+++ b/llama_stack/apis/common/content_types.py
@@ -0,0 +1,60 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from typing import Annotated, List, Literal, Optional, Union
+
+from llama_models.schema_utils import json_schema_type, register_schema
+
+from pydantic import BaseModel, Field, model_validator
+
+
+@json_schema_type(
+    schema={"type": "string", "format": "uri", "pattern": "^(https?://|file://|data:)"}
+)
+class URL(BaseModel):
+    uri: str
+
+    def __str__(self) -> str:
+        return self.uri
+
+
+class _URLOrData(BaseModel):
+    url: Optional[URL] = None
+    data: Optional[bytes] = None
+
+    @model_validator(mode="before")
+    @classmethod
+    def validator(cls, values):
+        if isinstance(values, dict):
+            return values
+        return {"url": values}
+
+
+@json_schema_type
+class ImageContentItem(_URLOrData):
+    type: Literal["image"] = "image"
+
+
+@json_schema_type
+class TextContentItem(BaseModel):
+    type: Literal["text"] = "text"
+    text: str
+
+
+# other modalities can be added here
+InterleavedContentItem = register_schema(
+    Annotated[
+        Union[ImageContentItem, TextContentItem],
+        Field(discriminator="type"),
+    ],
+    name="InterleavedContentItem",
+)
+
+# accept a single "str" as a special case since it is common
+InterleavedContent = register_schema(
+    Union[str, InterleavedContentItem, List[InterleavedContentItem]],
+    name="InterleavedContent",
+)
--- a/llama_stack/apis/common/deployment_types.py
+++ b/llama_stack/apis/common/deployment_types.py
@@ -7,12 +7,12 @@
 from enum import Enum
 from typing import Any, Dict, Optional

-from llama_models.llama3.api.datatypes import URL
-
 from llama_models.schema_utils import json_schema_type

 from pydantic import BaseModel

+from llama_stack.apis.common.content_types import URL
+

@json_schema_type
 class RestAPIMethod(Enum):
--- a/llama_stack/apis/common/type_system.py
+++ b/llama_stack/apis/common/type_system.py
@@ -6,6 +6,7 @@

 from typing import Literal, Union

+from llama_models.schema_utils import register_schema
 from pydantic import BaseModel, Field
 from typing_extensions import Annotated

@@ -53,21 +54,24 @@ class AgentTurnInputType(BaseModel):
    type: Literal["agent_turn_input"] = "agent_turn_input"


-ParamType = Annotated[
-    Union[
-        StringType,
-        NumberType,
-        BooleanType,
-        ArrayType,
-        ObjectType,
-        JsonType,
-        UnionType,
-        ChatCompletionInputType,
-        CompletionInputType,
-        AgentTurnInputType,
+ParamType = register_schema(
+    Annotated[
+        Union[
+            StringType,
+            NumberType,
+            BooleanType,
+            ArrayType,
+            ObjectType,
+            JsonType,
+            UnionType,
+            ChatCompletionInputType,
+            CompletionInputType,
+            AgentTurnInputType,
+        ],
+        Field(discriminator="type"),
    ],
-    Field(discriminator="type"),
-]
+    name="ParamType",
+)

 # TODO: recursive definition of ParamType in these containers
 # will cause infinite recursion in OpenAPI generation script