actually test structured output in completion

2025-07-29 07:14:20 +00:00 · 2024-10-24 14:44:31 -07:00 · 2024-10-24 14:44:31 -07:00 · 9bf1388429
commit 9bf1388429
parent 3796dbd4a5
3 changed files with 35 additions and 26 deletions
--- a/llama_stack/providers/adapters/inference/tgi/tgi.py
+++ b/llama_stack/providers/adapters/inference/tgi/tgi.py
@ -82,6 +82,7 @@ class _HfAdapter(Inference, ModelsProtocolPrivate):
            model=model,
            content=content,
            sampling_params=sampling_params,
+            response_format=response_format,
            stream=stream,
            logprobs=logprobs,
        )
--- a/llama_stack/providers/tests/inference/test_inference.py
+++ b/llama_stack/providers/tests/inference/test_inference.py
@ -185,33 +185,30 @@ async def test_completions_structured_output(inference_settings):
            "Other inference providers don't support structured output in completions yet"
        )

-        class Animals(BaseModel):
-            location: str
-            activity: str
-            animals_seen: conint(ge=1, le=5)  # Constrained integer type
-            animals: List[str]
+    class Output(BaseModel):
+        name: str
+        year_born: str
+        year_retired: str

-        user_input = "I saw a puppy a cat and a raccoon during my bike ride in the park"
-        response = await inference_impl.completion(
-            content=f"convert to JSON: 'f{user_input}'. please use the following schema: {Animals.schema()}",
-            stream=False,
-            model=params["model"],
-            sampling_params=SamplingParams(
-                max_tokens=50,
-            ),
-            response_format=JsonResponseFormat(
-                schema=Animals.model_json_schema(),
-            ),
-            **inference_settings["common_params"],
-        )
-        assert isinstance(response, CompletionResponse)
-        assert isinstance(response.completion_message.content, str)
+    user_input = "Michael Jordan was born in 1963. He played basketball for the Chicago Bulls. He retired in 2003."
+    response = await inference_impl.completion(
+        content=f"input: '{user_input}'. the schema for json: {Output.schema()}, the json is: ",
+        stream=False,
+        model=params["model"],
+        sampling_params=SamplingParams(
+            max_tokens=50,
+        ),
+        response_format=JsonResponseFormat(
+            schema=Output.model_json_schema(),
+        ),
+    )
+    assert isinstance(response, CompletionResponse)
+    assert isinstance(response.content, str)

-        answer = Animals.parse_raw(response.completion_message.content)
-        assert answer.activity == "bike ride"
-        assert answer.animals == ["puppy", "cat", "raccoon"]
-        assert answer.animals_seen == 3
-        assert answer.location == "park"
+    answer = Output.parse_raw(response.content)
+    assert "Michael Jordan" in answer.name
+    assert answer.year_born == "1963"
+    assert answer.year_retired == "2003"


@pytest.mark.asyncio
--- a/llama_stack/providers/utils/inference/openai_compat.py
+++ b/llama_stack/providers/utils/inference/openai_compat.py
@ -64,7 +64,18 @@ def process_completion_response(
    response: OpenAICompatCompletionResponse, formatter: ChatFormat
 ) -> CompletionResponse:
    choice = response.choices[0]
-
+    # drop the "<|eot_id|>" suffix if present and return stop reason as end of turn
+    if choice.text.endswith("<|eot_id|>"):
+        return CompletionResponse(
+            stop_reason=StopReason.end_of_turn,
+            content=choice.text[: -len("<|eot_id|>")],
+        )
+    # drop the "<|eom_id|>" suffix if present and return stop reason as end of message
+    if choice.text.endswith("<|eom_id|>"):
+        return CompletionResponse(
+            stop_reason=StopReason.end_of_message,
+            content=choice.text[: -len("<|eom_id|>")],
+        )
    return CompletionResponse(
        stop_reason=get_stop_reason(choice.finish_reason),
        content=choice.text,