From fb418813fc40661a851e20768a7f7c08c5cbe461 Mon Sep 17 00:00:00 2001
From: yyymeta <123776235+yyymeta@users.noreply.github.com>
Date: Mon, 17 Mar 2025 13:42:08 -0700
Subject: [PATCH] fix: passthrough impl response.content.text (#1665)

# What does this PR do?

The current passthrough implementation returns `completion_message.content` as a `TextItem()` rather than a plain string, which is incompatible with other providers and causes parsing errors downstream. This change replaces the generic pydantic conversion with an explicit extraction of `content.text`.

## Test Plan

Set up a llama server with the passthrough provider, then run:

```
llama-stack-client eval run-benchmark "MMMU_Pro_standard" --model-id meta-llama/Llama-3-8B --output-dir /tmp/ --num-examples 20
```

The benchmark runs without parsing errors.
---
 .../remote/inference/passthrough/passthrough.py  | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/llama_stack/providers/remote/inference/passthrough/passthrough.py b/llama_stack/providers/remote/inference/passthrough/passthrough.py
index 8f3a0d147..96b2d73d8 100644
--- a/llama_stack/providers/remote/inference/passthrough/passthrough.py
+++ b/llama_stack/providers/remote/inference/passthrough/passthrough.py
@@ -12,6 +12,7 @@ from llama_stack.apis.common.content_types import InterleavedContent
 from llama_stack.apis.inference import (
     ChatCompletionResponse,
     ChatCompletionResponseStreamChunk,
+    CompletionMessage,
     EmbeddingsResponse,
     EmbeddingTaskType,
     Inference,
@@ -160,12 +161,14 @@ class PassthroughInferenceAdapter(Inference):
         client = self._get_client()
 
         response = await client.inference.chat_completion(**json_params)
-        response = response.to_dict()
-
-        # temporary hack to remove the metrics from the response
-        response["metrics"] = []
-
-        return convert_to_pydantic(ChatCompletionResponse, response)
+        return ChatCompletionResponse(
+            completion_message=CompletionMessage(
+                content=response.completion_message.content.text,
+                stop_reason=response.completion_message.stop_reason,
+                tool_calls=response.completion_message.tool_calls,
+            ),
+            logprobs=response.logprobs,
+        )
 
     async def _stream_chat_completion(self, json_params: Dict[str, Any]) -> AsyncGenerator:
         client = self._get_client()
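
For readers outside the codebase, here is a minimal, self-contained sketch of the shape mismatch this patch addresses. The dataclasses below are hypothetical stand-ins, not the real `llama_stack` types: the upstream client hands back `content` as a text-item object, while downstream consumers expect a plain `str`, so the adapter now unwraps `.text` explicitly.

```python
# Hypothetical stand-ins for illustration only; the real llama_stack
# classes differ. This sketches the shape mismatch, not the actual API.
from dataclasses import dataclass, field
from typing import List


@dataclass
class TextItem:
    text: str
    type: str = "text"


@dataclass
class UpstreamCompletionMessage:
    # Upstream: content arrives as an object wrapping the string.
    content: TextItem
    stop_reason: str = "end_of_turn"
    tool_calls: List = field(default_factory=list)


@dataclass
class CompletionMessage:
    # Downstream: other providers return content as a plain string.
    content: str
    stop_reason: str
    tool_calls: List


upstream = UpstreamCompletionMessage(content=TextItem(text="Hello!"))

# Before the patch, a generic dict/pydantic round trip preserved the nested
# {"type": "text", "text": "Hello!"} object, which downstream parsers reject.
# The fix unwraps the wrapper explicitly, as in the hunk above:
fixed = CompletionMessage(
    content=upstream.content.text,  # "Hello!" as a plain str
    stop_reason=upstream.stop_reason,
    tool_calls=upstream.tool_calls,
)
assert isinstance(fixed.content, str)
```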