Update the "InterleavedTextMedia" type (#635)

## What does this PR do? This is a long-pending change and particularly important to get done now. Specifically: - we cannot "localize" (aka download) any URLs from media attachments anywhere near our modeling code. it must be done within llama-stack. - `PIL.Image` is infesting all our APIs via `ImageMedia -> InterleavedTextMedia` and that cannot be right at all. Anything in the API surface must be "naturally serializable". We need a standard `{ type: "image", image_url: "<...>" }` which is more extensible - `UserMessage`, `SystemMessage`, etc. are moved completely to llama-stack from the llama-models repository. See https://github.com/meta-llama/llama-models/pull/244 for the corresponding PR in llama-models. ## Test Plan ```bash cd llama_stack/providers/tests pytest -s -v -k "fireworks or ollama or together" inference/test_vision_inference.py pytest -s -v -k "(fireworks or ollama or together) and llama_3b" inference/test_text_inference.py pytest -s -v -k chroma memory/test_memory.py \ --env EMBEDDING_DIMENSION=384 --env CHROMA_DB_PATH=/tmp/foobar pytest -s -v -k fireworks agents/test_agents.py \ --safety-shield=meta-llama/Llama-Guard-3-8B \ --inference-model=meta-llama/Llama-3.1-8B-Instruct ``` Updated the client sdk (see PR ...), installed the SDK in the same environment and then ran the SDK tests: ```bash cd tests/client-sdk LLAMA_STACK_CONFIG=together pytest -s -v agents/test_agents.py LLAMA_STACK_CONFIG=ollama pytest -s -v memory/test_memory.py # this one needed a bit of hacking in the run.yaml to ensure I could register the vision model correctly INFERENCE_MODEL=llama3.2-vision:latest LLAMA_STACK_CONFIG=ollama pytest -s -v inference/test_inference.py ```
2024-12-17 11:18:31 -08:00 · 2024-12-17 11:18:31 -08:00 · 8de8eb03c8
commit 8de8eb03c8
parent 10eb31badf
66 changed files with 1344 additions and 1801 deletions
--- a/llama_stack/apis/inference/inference.py
+++ b/llama_stack/apis/inference/inference.py
@ -16,14 +16,23 @@ from typing import (
    Union,
 )

+from llama_models.llama3.api.datatypes import (
+    BuiltinTool,
+    SamplingParams,
+    StopReason,
+    ToolCall,
+    ToolDefinition,
+    ToolPromptFormat,
+)
+
 from llama_models.schema_utils import json_schema_type, webmethod

-from pydantic import BaseModel, Field
+from pydantic import BaseModel, Field, field_validator
 from typing_extensions import Annotated

-from llama_stack.providers.utils.telemetry.trace_protocol import trace_protocol
+from llama_stack.apis.common.content_types import InterleavedContent

-from llama_models.llama3.api.datatypes import *  # noqa: F403
+from llama_stack.providers.utils.telemetry.trace_protocol import trace_protocol
 from llama_stack.apis.models import *  # noqa: F403


@ -40,17 +49,17 @@ class QuantizationType(Enum):

@json_schema_type
 class Fp8QuantizationConfig(BaseModel):
-    type: Literal[QuantizationType.fp8.value] = QuantizationType.fp8.value
+    type: Literal["fp8"] = "fp8"


@json_schema_type
 class Bf16QuantizationConfig(BaseModel):
-    type: Literal[QuantizationType.bf16.value] = QuantizationType.bf16.value
+    type: Literal["bf16"] = "bf16"


@json_schema_type
 class Int4QuantizationConfig(BaseModel):
-    type: Literal[QuantizationType.int4.value] = QuantizationType.int4.value
+    type: Literal["int4"] = "int4"
    scheme: Optional[str] = "int4_weight_int8_dynamic_activation"


@ -60,6 +69,76 @@ QuantizationConfig = Annotated[
 ]


+@json_schema_type
+class UserMessage(BaseModel):
+    role: Literal["user"] = "user"
+    content: InterleavedContent
+    context: Optional[InterleavedContent] = None
+
+
+@json_schema_type
+class SystemMessage(BaseModel):
+    role: Literal["system"] = "system"
+    content: InterleavedContent
+
+
+@json_schema_type
+class ToolResponseMessage(BaseModel):
+    role: Literal["ipython"] = "ipython"
+    # it was nice to re-use the ToolResponse type, but having all messages
+    # have a `content` type makes things nicer too
+    call_id: str
+    tool_name: Union[BuiltinTool, str]
+    content: InterleavedContent
+
+
+@json_schema_type
+class CompletionMessage(BaseModel):
+    role: Literal["assistant"] = "assistant"
+    content: InterleavedContent
+    stop_reason: StopReason
+    tool_calls: List[ToolCall] = Field(default_factory=list)
+
+
+Message = Annotated[
+    Union[
+        UserMessage,
+        SystemMessage,
+        ToolResponseMessage,
+        CompletionMessage,
+    ],
+    Field(discriminator="role"),
+]
+
+
+@json_schema_type
+class ToolResponse(BaseModel):
+    call_id: str
+    tool_name: Union[BuiltinTool, str]
+    content: InterleavedContent
+
+    @field_validator("tool_name", mode="before")
+    @classmethod
+    def validate_field(cls, v):
+        if isinstance(v, str):
+            try:
+                return BuiltinTool(v)
+            except ValueError:
+                return v
+        return v
+
+
+@json_schema_type
+class ToolChoice(Enum):
+    auto = "auto"
+    required = "required"
+
+
+@json_schema_type
+class TokenLogProbs(BaseModel):
+    logprobs_by_token: Dict[str, float]
+
+
@json_schema_type
 class ChatCompletionResponseEventType(Enum):
    start = "start"
@ -117,7 +196,7 @@ ResponseFormat = Annotated[
@json_schema_type
 class CompletionRequest(BaseModel):
    model: str
-    content: InterleavedTextMedia
+    content: InterleavedContent
    sampling_params: Optional[SamplingParams] = SamplingParams()
    response_format: Optional[ResponseFormat] = None

@ -146,7 +225,7 @@ class CompletionResponseStreamChunk(BaseModel):
@json_schema_type
 class BatchCompletionRequest(BaseModel):
    model: str
-    content_batch: List[InterleavedTextMedia]
+    content_batch: List[InterleavedContent]
    sampling_params: Optional[SamplingParams] = SamplingParams()
    response_format: Optional[ResponseFormat] = None
    logprobs: Optional[LogProbConfig] = None
@ -230,7 +309,7 @@ class Inference(Protocol):
    async def completion(
        self,
        model_id: str,
-        content: InterleavedTextMedia,
+        content: InterleavedContent,
        sampling_params: Optional[SamplingParams] = SamplingParams(),
        response_format: Optional[ResponseFormat] = None,
        stream: Optional[bool] = False,
@ -258,5 +337,5 @@ class Inference(Protocol):
    async def embeddings(
        self,
        model_id: str,
-        contents: List[InterleavedTextMedia],
+        contents: List[InterleavedContent],
    ) -> EmbeddingsResponse: ...