diff --git a/docs/resources/llama-stack-spec.html b/docs/resources/llama-stack-spec.html index f6dd1c8dc..139314776 100644 --- a/docs/resources/llama-stack-spec.html +++ b/docs/resources/llama-stack-spec.html @@ -3761,22 +3761,29 @@ "ImageContentItem": { "type": "object", "properties": { - "url": { - "$ref": "#/components/schemas/URL" - }, - "data": { - "type": "string", - "contentEncoding": "base64" - }, "type": { "type": "string", "const": "image", "default": "image" + }, + "image": { + "type": "object", + "properties": { + "url": { + "$ref": "#/components/schemas/URL" + }, + "data": { + "type": "string", + "contentEncoding": "base64" + } + }, + "additionalProperties": false } }, "additionalProperties": false, "required": [ - "type" + "type", + "image" ] }, "InterleavedContent": { @@ -4518,7 +4525,7 @@ "const": "image", "default": "image" }, - "data": { + "image": { "type": "string", "contentEncoding": "base64" } @@ -4526,7 +4533,7 @@ "additionalProperties": false, "required": [ "type", - "data" + "image" ] }, "TextDelta": { @@ -4570,7 +4577,7 @@ "const": "tool_call", "default": "tool_call" }, - "content": { + "tool_call": { "oneOf": [ { "type": "string" @@ -4587,7 +4594,7 @@ "additionalProperties": false, "required": [ "type", - "content", + "tool_call", "parse_status" ] }, diff --git a/docs/resources/llama-stack-spec.yaml b/docs/resources/llama-stack-spec.yaml index 6bbaadf8d..1a8c44bc0 100644 --- a/docs/resources/llama-stack-spec.yaml +++ b/docs/resources/llama-stack-spec.yaml @@ -926,22 +926,27 @@ components: ImageContentItem: additionalProperties: false properties: - data: - contentEncoding: base64 - type: string + image: + additionalProperties: false + properties: + data: + contentEncoding: base64 + type: string + url: + $ref: '#/components/schemas/URL' + type: object type: const: image default: image type: string - url: - $ref: '#/components/schemas/URL' required: - type + - image type: object ImageDelta: additionalProperties: false properties: - data: + image: contentEncoding: base64 type: string type: @@ -950,7 +955,7 @@ components: type: string required: - type - - data + - image type: object InferenceStep: additionalProperties: false @@ -2748,19 +2753,19 @@ components: ToolCallDelta: additionalProperties: false properties: - content: + parse_status: + $ref: '#/components/schemas/ToolCallParseStatus' + tool_call: oneOf: - type: string - $ref: '#/components/schemas/ToolCall' - parse_status: - $ref: '#/components/schemas/ToolCallParseStatus' type: const: tool_call default: tool_call type: string required: - type - - content + - tool_call - parse_status type: object ToolCallParseStatus: diff --git a/llama_stack/apis/agents/event_logger.py b/llama_stack/apis/agents/event_logger.py index ddb2a7cf4..7a607ffda 100644 --- a/llama_stack/apis/agents/event_logger.py +++ b/llama_stack/apis/agents/event_logger.py @@ -137,7 +137,7 @@ class EventLogger: event, LogEvent( role=None, - content=delta.content, + content=delta.tool_call, end="", color="cyan", ), diff --git a/llama_stack/apis/common/content_types.py b/llama_stack/apis/common/content_types.py index b845d09dd..1d8cea567 100644 --- a/llama_stack/apis/common/content_types.py +++ b/llama_stack/apis/common/content_types.py @@ -38,8 +38,9 @@ class _URLOrData(BaseModel): @json_schema_type -class ImageContentItem(_URLOrData): +class ImageContentItem(BaseModel): type: Literal["image"] = "image" + image: _URLOrData @json_schema_type @@ -73,7 +74,7 @@ class TextDelta(BaseModel): @json_schema_type class ImageDelta(BaseModel): type: Literal["image"] = "image" - data: bytes + image: bytes @json_schema_type @@ -91,7 +92,7 @@ class ToolCallDelta(BaseModel): # you either send an in-progress tool call so the client can stream a long # code generation or you send the final parsed tool call at the end of the # stream - content: Union[str, ToolCall] + tool_call: Union[str, ToolCall] parse_status: ToolCallParseStatus diff --git a/llama_stack/providers/inline/agents/meta_reference/agent_instance.py b/llama_stack/providers/inline/agents/meta_reference/agent_instance.py index 75fd75afc..1b375fba7 100644 --- a/llama_stack/providers/inline/agents/meta_reference/agent_instance.py +++ b/llama_stack/providers/inline/agents/meta_reference/agent_instance.py @@ -423,7 +423,7 @@ class ChatAgent(ShieldRunnerMixin): step_id=step_id, delta=ToolCallDelta( parse_status=ToolCallParseStatus.succeeded, - content=ToolCall( + tool_call=ToolCall( call_id="", tool_name=MEMORY_QUERY_TOOL, arguments={}, @@ -525,7 +525,7 @@ class ChatAgent(ShieldRunnerMixin): delta = event.delta if delta.type == "tool_call": if delta.parse_status == ToolCallParseStatus.succeeded: - tool_calls.append(delta.content) + tool_calls.append(delta.tool_call) if stream: yield AgentTurnResponseStreamChunk( event=AgentTurnResponseEvent( @@ -639,7 +639,7 @@ class ChatAgent(ShieldRunnerMixin): tool_call=tool_call, delta=ToolCallDelta( parse_status=ToolCallParseStatus.in_progress, - content=tool_call, + tool_call=tool_call, ), ) ) diff --git a/llama_stack/providers/inline/inference/meta_reference/inference.py b/llama_stack/providers/inline/inference/meta_reference/inference.py index 31ad6fa28..73962ca7f 100644 --- a/llama_stack/providers/inline/inference/meta_reference/inference.py +++ b/llama_stack/providers/inline/inference/meta_reference/inference.py @@ -377,7 +377,7 @@ class MetaReferenceInferenceImpl( event=ChatCompletionResponseEvent( event_type=ChatCompletionResponseEventType.progress, delta=ToolCallDelta( - content="", + tool_call="", parse_status=ToolCallParseStatus.started, ), ) @@ -395,7 +395,7 @@ class MetaReferenceInferenceImpl( if ipython: delta = ToolCallDelta( - content=text, + tool_call=text, parse_status=ToolCallParseStatus.in_progress, ) else: @@ -434,7 +434,7 @@ class MetaReferenceInferenceImpl( event=ChatCompletionResponseEvent( event_type=ChatCompletionResponseEventType.progress, delta=ToolCallDelta( - content="", + tool_call="", parse_status=ToolCallParseStatus.failed, ), stop_reason=stop_reason, @@ -446,7 +446,7 @@ class MetaReferenceInferenceImpl( event=ChatCompletionResponseEvent( event_type=ChatCompletionResponseEventType.progress, delta=ToolCallDelta( - content=tool_call, + tool_call=tool_call, parse_status=ToolCallParseStatus.succeeded, ), stop_reason=stop_reason, diff --git a/llama_stack/providers/remote/inference/groq/groq_utils.py b/llama_stack/providers/remote/inference/groq/groq_utils.py index b614c90f4..bd1a07d7c 100644 --- a/llama_stack/providers/remote/inference/groq/groq_utils.py +++ b/llama_stack/providers/remote/inference/groq/groq_utils.py @@ -218,7 +218,7 @@ async def convert_chat_completion_response_stream( event=ChatCompletionResponseEvent( event_type=event_type, delta=ToolCallDelta( - content=tool_call, + tool_call=tool_call, parse_status=ToolCallParseStatus.succeeded, ), ) diff --git a/llama_stack/providers/remote/inference/nvidia/openai_utils.py b/llama_stack/providers/remote/inference/nvidia/openai_utils.py index e85c8dd21..0f753f80d 100644 --- a/llama_stack/providers/remote/inference/nvidia/openai_utils.py +++ b/llama_stack/providers/remote/inference/nvidia/openai_utils.py @@ -505,7 +505,9 @@ async def convert_openai_chat_completion_stream( event=ChatCompletionResponseEvent( event_type=next(event_type), delta=ToolCallDelta( - content=_convert_openai_tool_calls(choice.delta.tool_calls)[0], + tool_call=_convert_openai_tool_calls(choice.delta.tool_calls)[ + 0 + ], parse_status=ToolCallParseStatus.succeeded, ), logprobs=_convert_openai_logprobs(choice.logprobs), diff --git a/llama_stack/providers/tests/inference/groq/test_groq_utils.py b/llama_stack/providers/tests/inference/groq/test_groq_utils.py index 0402a772c..f6f593f16 100644 --- a/llama_stack/providers/tests/inference/groq/test_groq_utils.py +++ b/llama_stack/providers/tests/inference/groq/test_groq_utils.py @@ -472,7 +472,7 @@ class TestConvertStreamChatCompletionResponse: iter = converted.__aiter__() chunk = await iter.__anext__() assert chunk.event.event_type == ChatCompletionResponseEventType.start - assert chunk.event.delta.content == ToolCall( + assert chunk.event.delta.tool_call == ToolCall( call_id="tool_call_id", tool_name="get_flight_info", arguments={"origin": "AU", "destination": "LAX"}, diff --git a/llama_stack/providers/tests/inference/test_text_inference.py b/llama_stack/providers/tests/inference/test_text_inference.py index cbc8232c8..c39556b8e 100644 --- a/llama_stack/providers/tests/inference/test_text_inference.py +++ b/llama_stack/providers/tests/inference/test_text_inference.py @@ -470,16 +470,16 @@ class TestInference: ) first = grouped[ChatCompletionResponseEventType.progress][0] if not isinstance( - first.event.delta.content, ToolCall + first.event.delta.tool_call, ToolCall ): # first chunk may contain entire call assert first.event.delta.parse_status == ToolCallParseStatus.started last = grouped[ChatCompletionResponseEventType.progress][-1] # assert last.event.stop_reason == expected_stop_reason assert last.event.delta.parse_status == ToolCallParseStatus.succeeded - assert isinstance(last.event.delta.content, ToolCall) + assert isinstance(last.event.delta.tool_call, ToolCall) - call = last.event.delta.content + call = last.event.delta.tool_call assert call.tool_name == "get_weather" assert "location" in call.arguments assert "San Francisco" in call.arguments["location"] diff --git a/llama_stack/providers/tests/inference/test_vision_inference.py b/llama_stack/providers/tests/inference/test_vision_inference.py index df2f3cfb9..100a70236 100644 --- a/llama_stack/providers/tests/inference/test_vision_inference.py +++ b/llama_stack/providers/tests/inference/test_vision_inference.py @@ -32,13 +32,15 @@ class TestVisionModelInference: "image, expected_strings", [ ( - ImageContentItem(data=PASTA_IMAGE), + ImageContentItem(image=dict(data=PASTA_IMAGE)), ["spaghetti"], ), ( ImageContentItem( - url=URL( - uri="https://www.healthypawspetinsurance.com/Images/V3/DogAndPuppyInsurance/Dog_CTA_Desktop_HeroImage.jpg" + image=dict( + url=URL( + uri="https://www.healthypawspetinsurance.com/Images/V3/DogAndPuppyInsurance/Dog_CTA_Desktop_HeroImage.jpg" + ) ) ), ["puppy"], @@ -103,8 +105,10 @@ class TestVisionModelInference: images = [ ImageContentItem( - url=URL( - uri="https://www.healthypawspetinsurance.com/Images/V3/DogAndPuppyInsurance/Dog_CTA_Desktop_HeroImage.jpg" + image=dict( + url=URL( + uri="https://www.healthypawspetinsurance.com/Images/V3/DogAndPuppyInsurance/Dog_CTA_Desktop_HeroImage.jpg" + ) ) ), ] diff --git a/llama_stack/providers/utils/inference/openai_compat.py b/llama_stack/providers/utils/inference/openai_compat.py index 127fd19f3..6c93f49c0 100644 --- a/llama_stack/providers/utils/inference/openai_compat.py +++ b/llama_stack/providers/utils/inference/openai_compat.py @@ -240,7 +240,7 @@ async def process_chat_completion_stream_response( event=ChatCompletionResponseEvent( event_type=ChatCompletionResponseEventType.progress, delta=ToolCallDelta( - content="", + tool_call="", parse_status=ToolCallParseStatus.started, ), ) @@ -260,7 +260,7 @@ async def process_chat_completion_stream_response( if ipython: buffer += text delta = ToolCallDelta( - content=text, + tool_call=text, parse_status=ToolCallParseStatus.in_progress, ) @@ -289,7 +289,7 @@ async def process_chat_completion_stream_response( event=ChatCompletionResponseEvent( event_type=ChatCompletionResponseEventType.progress, delta=ToolCallDelta( - content="", + tool_call="", parse_status=ToolCallParseStatus.failed, ), stop_reason=stop_reason, @@ -301,7 +301,7 @@ async def process_chat_completion_stream_response( event=ChatCompletionResponseEvent( event_type=ChatCompletionResponseEventType.progress, delta=ToolCallDelta( - content=tool_call, + tool_call=tool_call, parse_status=ToolCallParseStatus.succeeded, ), stop_reason=stop_reason, diff --git a/llama_stack/providers/utils/inference/prompt_adapter.py b/llama_stack/providers/utils/inference/prompt_adapter.py index 701b2ca3b..f5298d844 100644 --- a/llama_stack/providers/utils/inference/prompt_adapter.py +++ b/llama_stack/providers/utils/inference/prompt_adapter.py @@ -113,28 +113,29 @@ async def interleaved_content_convert_to_raw( elif isinstance(c, TextContentItem): return RawTextItem(text=c.text) elif isinstance(c, ImageContentItem): - if c.url: + image = c.image + if image.url: # Load image bytes from URL - if c.url.uri.startswith("data"): - match = re.match(r"data:image/(\w+);base64,(.+)", c.url.uri) + if image.url.uri.startswith("data"): + match = re.match(r"data:image/(\w+);base64,(.+)", image.url.uri) if not match: raise ValueError( - f"Invalid data URL format, {c.url.uri[:40]}..." + f"Invalid data URL format, {image.url.uri[:40]}..." ) _, image_data = match.groups() data = base64.b64decode(image_data) - elif c.url.uri.startswith("file://"): - path = c.url.uri[len("file://") :] + elif image.url.uri.startswith("file://"): + path = image.url.uri[len("file://") :] with open(path, "rb") as f: data = f.read() # type: ignore - elif c.url.uri.startswith("http"): + elif image.url.uri.startswith("http"): async with httpx.AsyncClient() as client: - response = await client.get(c.url.uri) + response = await client.get(image.url.uri) data = response.content else: raise ValueError("Unsupported URL type") - elif c.data: - data = c.data + elif image.data: + data = image.data else: raise ValueError("No data or URL provided") @@ -170,26 +171,29 @@ def request_has_media(request: Union[ChatCompletionRequest, CompletionRequest]): async def localize_image_content(media: ImageContentItem) -> Tuple[bytes, str]: - if media.url and media.url.uri.startswith("http"): + image = media.image + if image.url and image.url.uri.startswith("http"): async with httpx.AsyncClient() as client: - r = await client.get(media.url.uri) + r = await client.get(image.url.uri) content = r.content content_type = r.headers.get("content-type") if content_type: format = content_type.split("/")[-1] else: format = "png" + return content, format else: - image = PIL_Image.open(io.BytesIO(media.data)) - return media.data, image.format + pil_image = PIL_Image.open(io.BytesIO(image.data)) + return image.data, pil_image.format async def convert_image_content_to_url( media: ImageContentItem, download: bool = False, include_format: bool = True ) -> str: - if media.url and (not download or media.url.uri.startswith("data")): - return media.url.uri + image = media.image + if image.url and (not download or image.url.uri.startswith("data")): + return image.url.uri content, format = await localize_image_content(media) if include_format: diff --git a/tests/client-sdk/inference/test_inference.py b/tests/client-sdk/inference/test_inference.py index 08c7e1693..b1f1dd139 100644 --- a/tests/client-sdk/inference/test_inference.py +++ b/tests/client-sdk/inference/test_inference.py @@ -258,7 +258,7 @@ def extract_tool_invocation_content(response): for chunk in response: delta = chunk.event.delta if delta.type == "tool_call" and delta.parse_status == "succeeded": - call = delta.content + call = delta.tool_call tool_invocation_content += f"[{call.tool_name}, {call.arguments}]" return tool_invocation_content @@ -321,9 +321,11 @@ def test_image_chat_completion_non_streaming(llama_stack_client, vision_model_id "content": [ { "type": "image", - "url": { - # TODO: Replace with Github based URI to resources/sample1.jpg - "uri": "https://www.healthypawspetinsurance.com/Images/V3/DogAndPuppyInsurance/Dog_CTA_Desktop_HeroImage.jpg" + "image": { + "url": { + # TODO: Replace with Github based URI to resources/sample1.jpg + "uri": "https://www.healthypawspetinsurance.com/Images/V3/DogAndPuppyInsurance/Dog_CTA_Desktop_HeroImage.jpg" + }, }, }, { @@ -348,9 +350,11 @@ def test_image_chat_completion_streaming(llama_stack_client, vision_model_id): "content": [ { "type": "image", - "url": { - # TODO: Replace with Github based URI to resources/sample1.jpg - "uri": "https://www.healthypawspetinsurance.com/Images/V3/DogAndPuppyInsurance/Dog_CTA_Desktop_HeroImage.jpg" + "image": { + "url": { + # TODO: Replace with Github based URI to resources/sample1.jpg + "uri": "https://www.healthypawspetinsurance.com/Images/V3/DogAndPuppyInsurance/Dog_CTA_Desktop_HeroImage.jpg" + }, }, }, { @@ -374,14 +378,15 @@ def test_image_chat_completion_streaming(llama_stack_client, vision_model_id): def test_image_chat_completion_base64_url( llama_stack_client, vision_model_id, base64_image_url ): - message = { "role": "user", "content": [ { "type": "image", - "url": { - "uri": base64_image_url, + "image": { + "url": { + "uri": base64_image_url, + }, }, }, { diff --git a/tests/client-sdk/safety/test_safety.py b/tests/client-sdk/safety/test_safety.py index 6af417a09..ac3221364 100644 --- a/tests/client-sdk/safety/test_safety.py +++ b/tests/client-sdk/safety/test_safety.py @@ -141,7 +141,7 @@ def test_safety_with_image(llama_stack_client, model_providers): }, { "type": "image", - "url": {"uri": data_url_from_image(file_path)}, + "image": {"url": {"uri": data_url_from_image(file_path)}}, }, ], }