Update the "InterleavedTextMedia" type (#635)

## What does this PR do?

This is a long-pending change and particularly important to get done
now.

Specifically:
- we cannot "localize" (aka download) any URLs from media attachments
anywhere near our modeling code. it must be done within llama-stack.
- `PIL.Image` is infesting all our APIs via `ImageMedia ->
InterleavedTextMedia` and that cannot be right at all. Anything in the
API surface must be "naturally serializable". We need a standard `{
type: "image", image_url: "<...>" }` which is more extensible
- `UserMessage`, `SystemMessage`, etc. are moved completely to
llama-stack from the llama-models repository.

See https://github.com/meta-llama/llama-models/pull/244 for the
corresponding PR in llama-models.

## Test Plan

```bash
cd llama_stack/providers/tests

pytest -s -v -k "fireworks or ollama or together" inference/test_vision_inference.py
pytest -s -v -k "(fireworks or ollama or together) and llama_3b" inference/test_text_inference.py
pytest -s -v -k chroma memory/test_memory.py \
  --env EMBEDDING_DIMENSION=384 --env CHROMA_DB_PATH=/tmp/foobar

pytest -s -v -k fireworks agents/test_agents.py  \
   --safety-shield=meta-llama/Llama-Guard-3-8B \
   --inference-model=meta-llama/Llama-3.1-8B-Instruct
```

Updated the client sdk (see PR ...), installed the SDK in the same
environment and then ran the SDK tests:

```bash
cd tests/client-sdk
LLAMA_STACK_CONFIG=together pytest -s -v agents/test_agents.py
LLAMA_STACK_CONFIG=ollama pytest -s -v memory/test_memory.py

# this one needed a bit of hacking in the run.yaml to ensure I could register the vision model correctly
INFERENCE_MODEL=llama3.2-vision:latest LLAMA_STACK_CONFIG=ollama pytest -s -v inference/test_inference.py
```
This commit is contained in:
Ashwin Bharambe 2024-12-17 11:18:31 -08:00 committed by GitHub
parent 10eb31badf
commit 8de8eb03c8
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
66 changed files with 1344 additions and 1801 deletions

View file

@ -29,11 +29,12 @@ from llama_stack.apis.common.deployment_types import * # noqa: F403
from llama_stack.apis.inference import * # noqa: F403
from llama_stack.apis.safety import * # noqa: F403
from llama_stack.apis.memory import * # noqa: F403
from llama_stack.apis.common.content_types import InterleavedContent, URL
@json_schema_type
class Attachment(BaseModel):
content: InterleavedTextMedia | URL
content: InterleavedContent | URL
mime_type: str
@ -102,20 +103,20 @@ class _MemoryBankConfigCommon(BaseModel):
class AgentVectorMemoryBankConfig(_MemoryBankConfigCommon):
type: Literal[MemoryBankType.vector.value] = MemoryBankType.vector.value
type: Literal["vector"] = "vector"
class AgentKeyValueMemoryBankConfig(_MemoryBankConfigCommon):
type: Literal[MemoryBankType.keyvalue.value] = MemoryBankType.keyvalue.value
type: Literal["keyvalue"] = "keyvalue"
keys: List[str] # what keys to focus on
class AgentKeywordMemoryBankConfig(_MemoryBankConfigCommon):
type: Literal[MemoryBankType.keyword.value] = MemoryBankType.keyword.value
type: Literal["keyword"] = "keyword"
class AgentGraphMemoryBankConfig(_MemoryBankConfigCommon):
type: Literal[MemoryBankType.graph.value] = MemoryBankType.graph.value
type: Literal["graph"] = "graph"
entities: List[str] # what entities to focus on
@ -230,7 +231,7 @@ class MemoryRetrievalStep(StepCommon):
StepType.memory_retrieval.value
)
memory_bank_ids: List[str]
inserted_context: InterleavedTextMedia
inserted_context: InterleavedContent
Step = Annotated[

View file

@ -17,7 +17,7 @@ from llama_stack.apis.inference import * # noqa: F403
@json_schema_type
class BatchCompletionRequest(BaseModel):
model: str
content_batch: List[InterleavedTextMedia]
content_batch: List[InterleavedContent]
sampling_params: Optional[SamplingParams] = SamplingParams()
logprobs: Optional[LogProbConfig] = None
@ -53,7 +53,7 @@ class BatchInference(Protocol):
async def batch_completion(
self,
model: str,
content_batch: List[InterleavedTextMedia],
content_batch: List[InterleavedContent],
sampling_params: Optional[SamplingParams] = SamplingParams(),
logprobs: Optional[LogProbConfig] = None,
) -> BatchCompletionResponse: ...

View file

@ -0,0 +1,60 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from typing import Annotated, List, Literal, Optional, Union
from llama_models.schema_utils import json_schema_type, register_schema
from pydantic import BaseModel, Field, model_validator
@json_schema_type(
schema={"type": "string", "format": "uri", "pattern": "^(https?://|file://|data:)"}
)
class URL(BaseModel):
uri: str
def __str__(self) -> str:
return self.uri
class _URLOrData(BaseModel):
url: Optional[URL] = None
data: Optional[bytes] = None
@model_validator(mode="before")
@classmethod
def validator(cls, values):
if isinstance(values, dict):
return values
return {"url": values}
@json_schema_type
class ImageContentItem(_URLOrData):
type: Literal["image"] = "image"
@json_schema_type
class TextContentItem(BaseModel):
type: Literal["text"] = "text"
text: str
# other modalities can be added here
InterleavedContentItem = register_schema(
Annotated[
Union[ImageContentItem, TextContentItem],
Field(discriminator="type"),
],
name="InterleavedContentItem",
)
# accept a single "str" as a special case since it is common
InterleavedContent = register_schema(
Union[str, InterleavedContentItem, List[InterleavedContentItem]],
name="InterleavedContent",
)

View file

@ -7,12 +7,12 @@
from enum import Enum
from typing import Any, Dict, Optional
from llama_models.llama3.api.datatypes import URL
from llama_models.schema_utils import json_schema_type
from pydantic import BaseModel
from llama_stack.apis.common.content_types import URL
@json_schema_type
class RestAPIMethod(Enum):

View file

@ -6,6 +6,7 @@
from typing import Literal, Union
from llama_models.schema_utils import register_schema
from pydantic import BaseModel, Field
from typing_extensions import Annotated
@ -53,21 +54,24 @@ class AgentTurnInputType(BaseModel):
type: Literal["agent_turn_input"] = "agent_turn_input"
ParamType = Annotated[
Union[
StringType,
NumberType,
BooleanType,
ArrayType,
ObjectType,
JsonType,
UnionType,
ChatCompletionInputType,
CompletionInputType,
AgentTurnInputType,
ParamType = register_schema(
Annotated[
Union[
StringType,
NumberType,
BooleanType,
ArrayType,
ObjectType,
JsonType,
UnionType,
ChatCompletionInputType,
CompletionInputType,
AgentTurnInputType,
],
Field(discriminator="type"),
],
Field(discriminator="type"),
]
name="ParamType",
)
# TODO: recursive definition of ParamType in these containers
# will cause infinite recursion in OpenAPI generation script

View file

@ -6,12 +6,12 @@
from typing import Any, Dict, List, Literal, Optional, Protocol
from llama_models.llama3.api.datatypes import URL
from llama_models.schema_utils import json_schema_type, webmethod
from pydantic import BaseModel, Field
from llama_stack.apis.common.content_types import URL
from llama_stack.apis.common.type_system import ParamType
from llama_stack.apis.resource import Resource, ResourceType

View file

@ -15,6 +15,7 @@ from llama_stack.apis.agents import AgentConfig
from llama_stack.apis.common.job_types import Job, JobStatus
from llama_stack.apis.scoring import * # noqa: F403
from llama_stack.apis.eval_tasks import * # noqa: F403
from llama_stack.apis.inference import SamplingParams, SystemMessage
@json_schema_type

View file

@ -16,14 +16,23 @@ from typing import (
Union,
)
from llama_models.llama3.api.datatypes import (
BuiltinTool,
SamplingParams,
StopReason,
ToolCall,
ToolDefinition,
ToolPromptFormat,
)
from llama_models.schema_utils import json_schema_type, webmethod
from pydantic import BaseModel, Field
from pydantic import BaseModel, Field, field_validator
from typing_extensions import Annotated
from llama_stack.providers.utils.telemetry.trace_protocol import trace_protocol
from llama_stack.apis.common.content_types import InterleavedContent
from llama_models.llama3.api.datatypes import * # noqa: F403
from llama_stack.providers.utils.telemetry.trace_protocol import trace_protocol
from llama_stack.apis.models import * # noqa: F403
@ -40,17 +49,17 @@ class QuantizationType(Enum):
@json_schema_type
class Fp8QuantizationConfig(BaseModel):
type: Literal[QuantizationType.fp8.value] = QuantizationType.fp8.value
type: Literal["fp8"] = "fp8"
@json_schema_type
class Bf16QuantizationConfig(BaseModel):
type: Literal[QuantizationType.bf16.value] = QuantizationType.bf16.value
type: Literal["bf16"] = "bf16"
@json_schema_type
class Int4QuantizationConfig(BaseModel):
type: Literal[QuantizationType.int4.value] = QuantizationType.int4.value
type: Literal["int4"] = "int4"
scheme: Optional[str] = "int4_weight_int8_dynamic_activation"
@ -60,6 +69,76 @@ QuantizationConfig = Annotated[
]
@json_schema_type
class UserMessage(BaseModel):
role: Literal["user"] = "user"
content: InterleavedContent
context: Optional[InterleavedContent] = None
@json_schema_type
class SystemMessage(BaseModel):
role: Literal["system"] = "system"
content: InterleavedContent
@json_schema_type
class ToolResponseMessage(BaseModel):
role: Literal["ipython"] = "ipython"
# it was nice to re-use the ToolResponse type, but having all messages
# have a `content` type makes things nicer too
call_id: str
tool_name: Union[BuiltinTool, str]
content: InterleavedContent
@json_schema_type
class CompletionMessage(BaseModel):
role: Literal["assistant"] = "assistant"
content: InterleavedContent
stop_reason: StopReason
tool_calls: List[ToolCall] = Field(default_factory=list)
Message = Annotated[
Union[
UserMessage,
SystemMessage,
ToolResponseMessage,
CompletionMessage,
],
Field(discriminator="role"),
]
@json_schema_type
class ToolResponse(BaseModel):
call_id: str
tool_name: Union[BuiltinTool, str]
content: InterleavedContent
@field_validator("tool_name", mode="before")
@classmethod
def validate_field(cls, v):
if isinstance(v, str):
try:
return BuiltinTool(v)
except ValueError:
return v
return v
@json_schema_type
class ToolChoice(Enum):
auto = "auto"
required = "required"
@json_schema_type
class TokenLogProbs(BaseModel):
logprobs_by_token: Dict[str, float]
@json_schema_type
class ChatCompletionResponseEventType(Enum):
start = "start"
@ -117,7 +196,7 @@ ResponseFormat = Annotated[
@json_schema_type
class CompletionRequest(BaseModel):
model: str
content: InterleavedTextMedia
content: InterleavedContent
sampling_params: Optional[SamplingParams] = SamplingParams()
response_format: Optional[ResponseFormat] = None
@ -146,7 +225,7 @@ class CompletionResponseStreamChunk(BaseModel):
@json_schema_type
class BatchCompletionRequest(BaseModel):
model: str
content_batch: List[InterleavedTextMedia]
content_batch: List[InterleavedContent]
sampling_params: Optional[SamplingParams] = SamplingParams()
response_format: Optional[ResponseFormat] = None
logprobs: Optional[LogProbConfig] = None
@ -230,7 +309,7 @@ class Inference(Protocol):
async def completion(
self,
model_id: str,
content: InterleavedTextMedia,
content: InterleavedContent,
sampling_params: Optional[SamplingParams] = SamplingParams(),
response_format: Optional[ResponseFormat] = None,
stream: Optional[bool] = False,
@ -258,5 +337,5 @@ class Inference(Protocol):
async def embeddings(
self,
model_id: str,
contents: List[InterleavedTextMedia],
contents: List[InterleavedContent],
) -> EmbeddingsResponse: ...

View file

@ -8,27 +8,27 @@
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from typing import List, Optional, Protocol, runtime_checkable
from typing import Any, Dict, List, Optional, Protocol, runtime_checkable
from llama_models.schema_utils import json_schema_type, webmethod
from pydantic import BaseModel, Field
from llama_models.llama3.api.datatypes import * # noqa: F403
from llama_stack.apis.memory_banks import * # noqa: F403
from llama_stack.apis.common.content_types import URL
from llama_stack.apis.inference import InterleavedContent
from llama_stack.apis.memory_banks import MemoryBank
from llama_stack.providers.utils.telemetry.trace_protocol import trace_protocol
@json_schema_type
class MemoryBankDocument(BaseModel):
document_id: str
content: InterleavedTextMedia | URL
content: InterleavedContent | URL
mime_type: str | None = None
metadata: Dict[str, Any] = Field(default_factory=dict)
class Chunk(BaseModel):
content: InterleavedTextMedia
content: InterleavedContent
token_count: int
document_id: str
@ -62,6 +62,6 @@ class Memory(Protocol):
async def query_documents(
self,
bank_id: str,
query: InterleavedTextMedia,
query: InterleavedContent,
params: Optional[Dict[str, Any]] = None,
) -> QueryDocumentsResponse: ...

View file

@ -5,16 +5,16 @@
# the root directory of this source tree.
from enum import Enum
from typing import Any, Dict, List, Protocol, runtime_checkable
from typing import Any, Dict, List, Optional, Protocol, runtime_checkable
from llama_models.schema_utils import json_schema_type, webmethod
from pydantic import BaseModel
from pydantic import BaseModel, Field
from llama_stack.apis.inference import Message
from llama_stack.apis.shields import Shield
from llama_stack.providers.utils.telemetry.trace_protocol import trace_protocol
from llama_models.llama3.api.datatypes import * # noqa: F403
from llama_stack.apis.shields import * # noqa: F403
@json_schema_type
class ViolationLevel(Enum):

View file

@ -13,6 +13,7 @@ from llama_models.schema_utils import json_schema_type, webmethod
from pydantic import BaseModel
from llama_models.llama3.api.datatypes import * # noqa: F403
from llama_stack.apis.inference import Message
class FilteringFunction(Enum):