fix ImageContentItem to take base64 string as image.data (#909)

# What does this PR do?

- Discussion in
https://github.com/meta-llama/llama-stack/pull/906#discussion_r1936260819

- `image.data` should accept a base64-encoded string as input instead of raw
binary bytes; change prompt_adapter to decode that string back to bytes.

## Test Plan

```
pytest -v tests/client-sdk/inference/test_inference.py
```

Run together with the test added in https://github.com/meta-llama/llama-stack/pull/906

## Sources

Please link relevant resources if necessary.


## Before submitting

- [ ] This PR fixes a typo or improves the docs (you can dismiss the
other checks if that's the case).
- [ ] Ran pre-commit to handle lint / formatting issues.
- [ ] Read the [contributor
guideline](https://github.com/meta-llama/llama-stack/blob/main/CONTRIBUTING.md),
      Pull Request section?
- [ ] Updated relevant documentation.
- [ ] Wrote necessary unit or integration tests.
This commit is contained in:
Xi Yan 2025-01-30 15:58:23 -08:00 committed by GitHub
parent 7fe2592795
commit 94051cfe9e
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
5 changed files with 85 additions and 31 deletions

View file

@ -248,7 +248,9 @@ class JsonSchemaGenerator:
type_schema.update(self._metadata_to_schema(m)) type_schema.update(self._metadata_to_schema(m))
return type_schema return type_schema
def _simple_type_to_schema(self, typ: TypeLike) -> Optional[Schema]: def _simple_type_to_schema(
self, typ: TypeLike, json_schema_extra: Optional[dict] = None
) -> Optional[Schema]:
""" """
Returns the JSON schema associated with a simple, unrestricted type. Returns the JSON schema associated with a simple, unrestricted type.
@ -264,6 +266,11 @@ class JsonSchemaGenerator:
elif typ is float: elif typ is float:
return {"type": "number"} return {"type": "number"}
elif typ is str: elif typ is str:
if json_schema_extra and "contentEncoding" in json_schema_extra:
return {
"type": "string",
"contentEncoding": json_schema_extra["contentEncoding"],
}
return {"type": "string"} return {"type": "string"}
elif typ is bytes: elif typ is bytes:
return {"type": "string", "contentEncoding": "base64"} return {"type": "string", "contentEncoding": "base64"}
@ -303,7 +310,12 @@ class JsonSchemaGenerator:
# not a simple type # not a simple type
return None return None
def type_to_schema(self, data_type: TypeLike, force_expand: bool = False) -> Schema: def type_to_schema(
self,
data_type: TypeLike,
force_expand: bool = False,
json_schema_extra: Optional[dict] = None,
) -> Schema:
""" """
Returns the JSON schema associated with a type. Returns the JSON schema associated with a type.
@ -313,7 +325,7 @@ class JsonSchemaGenerator:
""" """
# short-circuit for common simple types # short-circuit for common simple types
schema = self._simple_type_to_schema(data_type) schema = self._simple_type_to_schema(data_type, json_schema_extra)
if schema is not None: if schema is not None:
return schema return schema
@ -486,15 +498,9 @@ class JsonSchemaGenerator:
property_docstrings = get_class_property_docstrings( property_docstrings = get_class_property_docstrings(
typ, self.options.property_description_fun typ, self.options.property_description_fun
) )
properties: Dict[str, Schema] = {} properties: Dict[str, Schema] = {}
required: List[str] = [] required: List[str] = []
for property_name, property_type in get_class_properties(typ): for property_name, property_type in get_class_properties(typ):
defaults = {}
if "model_fields" in members:
f = members["model_fields"]
defaults = {k: finfo.default for k, finfo in f.items()}
# rename property if an alias name is specified # rename property if an alias name is specified
alias = get_annotation(property_type, Alias) alias = get_annotation(property_type, Alias)
if alias: if alias:
@ -502,11 +508,22 @@ class JsonSchemaGenerator:
else: else:
output_name = property_name output_name = property_name
defaults = {}
json_schema_extra = None
if "model_fields" in members:
f = members["model_fields"]
defaults = {k: finfo.default for k, finfo in f.items()}
json_schema_extra = f.get(output_name, None).json_schema_extra
if is_type_optional(property_type): if is_type_optional(property_type):
optional_type: type = unwrap_optional_type(property_type) optional_type: type = unwrap_optional_type(property_type)
property_def = self.type_to_schema(optional_type) property_def = self.type_to_schema(
optional_type, json_schema_extra=json_schema_extra
)
else: else:
property_def = self.type_to_schema(property_type) property_def = self.type_to_schema(
property_type, json_schema_extra=json_schema_extra
)
required.append(output_name) required.append(output_name)
# check if attribute has a default value initializer # check if attribute has a default value initializer

View file

@ -2439,27 +2439,32 @@
"type": { "type": {
"type": "string", "type": "string",
"const": "image", "const": "image",
"default": "image" "default": "image",
"description": "Discriminator type of the content item. Always \"image\""
}, },
"image": { "image": {
"type": "object", "type": "object",
"properties": { "properties": {
"url": { "url": {
"$ref": "#/components/schemas/URL" "$ref": "#/components/schemas/URL",
"description": "A URL of the image or data URL in the format of data:image/{type};base64,{data}. Note that URL could have length limits."
}, },
"data": { "data": {
"type": "string", "type": "string",
"contentEncoding": "base64" "contentEncoding": "base64",
"description": "base64 encoded image data as string"
} }
}, },
"additionalProperties": false "additionalProperties": false,
"description": "Image as a base64 encoded string or an URL"
} }
}, },
"additionalProperties": false, "additionalProperties": false,
"required": [ "required": [
"type", "type",
"image" "image"
] ],
"title": "A image content item"
}, },
"InterleavedContent": { "InterleavedContent": {
"oneOf": [ "oneOf": [
@ -2647,17 +2652,20 @@
"type": { "type": {
"type": "string", "type": "string",
"const": "text", "const": "text",
"default": "text" "default": "text",
"description": "Discriminator type of the content item. Always \"text\""
}, },
"text": { "text": {
"type": "string" "type": "string",
"description": "Text content"
} }
}, },
"additionalProperties": false, "additionalProperties": false,
"required": [ "required": [
"type", "type",
"text" "text"
] ],
"title": "A text content item"
}, },
"ToolCall": { "ToolCall": {
"type": "object", "type": "object",

View file

@ -1466,19 +1466,28 @@ components:
type: string type: string
const: image const: image
default: image default: image
description: >-
Discriminator type of the content item. Always "image"
image: image:
type: object type: object
properties: properties:
url: url:
$ref: '#/components/schemas/URL' $ref: '#/components/schemas/URL'
description: >-
A URL of the image or data URL in the format of data:image/{type};base64,{data}.
Note that URL could have length limits.
data: data:
type: string type: string
contentEncoding: base64 contentEncoding: base64
description: base64 encoded image data as string
additionalProperties: false additionalProperties: false
description: >-
Image as a base64 encoded string or an URL
additionalProperties: false additionalProperties: false
required: required:
- type - type
- image - image
title: A image content item
InterleavedContent: InterleavedContent:
oneOf: oneOf:
- type: string - type: string
@ -1598,12 +1607,16 @@ components:
type: string type: string
const: text const: text
default: text default: text
description: >-
Discriminator type of the content item. Always "text"
text: text:
type: string type: string
description: Text content
additionalProperties: false additionalProperties: false
required: required:
- type - type
- text - text
title: A text content item
ToolCall: ToolCall:
type: object type: object
properties: properties:

View file

@ -4,14 +4,13 @@
# This source code is licensed under the terms described in the LICENSE file in # This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree. # the root directory of this source tree.
import base64
from enum import Enum from enum import Enum
from typing import Annotated, List, Literal, Optional, Union from typing import Annotated, List, Literal, Optional, Union
from llama_models.llama3.api.datatypes import ToolCall from llama_models.llama3.api.datatypes import ToolCall
from llama_models.schema_utils import json_schema_type, register_schema from llama_models.schema_utils import json_schema_type, register_schema
from pydantic import BaseModel, Field, field_serializer, model_validator from pydantic import BaseModel, Field, model_validator
@json_schema_type @json_schema_type
@ -20,8 +19,16 @@ class URL(BaseModel):
class _URLOrData(BaseModel): class _URLOrData(BaseModel):
"""
A URL or a base64 encoded string
:param url: A URL of the image or data URL in the format of data:image/{type};base64,{data}. Note that URL could have length limits.
:param data: base64 encoded image data as string
"""
url: Optional[URL] = None url: Optional[URL] = None
data: Optional[bytes] = None # data is a base64 encoded string, hint with contentEncoding=base64
data: Optional[str] = Field(contentEncoding="base64", default=None)
@model_validator(mode="before") @model_validator(mode="before")
@classmethod @classmethod
@ -30,21 +37,27 @@ class _URLOrData(BaseModel):
return values return values
return {"url": values} return {"url": values}
@field_serializer("data")
def serialize_data(self, data: Optional[bytes], _info):
if data is None:
return None
return base64.b64encode(data).decode("utf-8")
@json_schema_type @json_schema_type
class ImageContentItem(BaseModel): class ImageContentItem(BaseModel):
"""A image content item
:param type: Discriminator type of the content item. Always "image"
:param image: Image as a base64 encoded string or an URL
"""
type: Literal["image"] = "image" type: Literal["image"] = "image"
image: _URLOrData image: _URLOrData
@json_schema_type @json_schema_type
class TextContentItem(BaseModel): class TextContentItem(BaseModel):
"""A text content item
:param type: Discriminator type of the content item. Always "text"
:param text: Text content
"""
type: Literal["text"] = "text" type: Literal["text"] = "text"
text: str text: str

View file

@ -135,7 +135,8 @@ async def interleaved_content_convert_to_raw(
else: else:
raise ValueError("Unsupported URL type") raise ValueError("Unsupported URL type")
elif image.data: elif image.data:
data = image.data # data is a base64 encoded string, decode it to bytes for RawMediaItem
data = base64.b64decode(image.data)
else: else:
raise ValueError("No data or URL provided") raise ValueError("No data or URL provided")
@ -184,8 +185,10 @@ async def localize_image_content(media: ImageContentItem) -> Tuple[bytes, str]:
return content, format return content, format
else: else:
pil_image = PIL_Image.open(io.BytesIO(image.data)) # data is a base64 encoded string, decode it to bytes first
return image.data, pil_image.format data_bytes = base64.b64decode(image.data)
pil_image = PIL_Image.open(io.BytesIO(data_bytes))
return data_bytes, pil_image.format
async def convert_image_content_to_url( async def convert_image_content_to_url(