Mirror of https://github.com/meta-llama/llama-stack.git
[#391] Add support for json structured output for vLLM
commit 1801aa145d (parent 4e6c984c26)
2 changed files with 13 additions and 0 deletions
vLLM inference adapter (class VLLMInferenceAdapter):

@@ -100,6 +100,7 @@ class VLLMInferenceAdapter(Inference, ModelsProtocolPrivate):
             tool_prompt_format=tool_prompt_format,
             stream=stream,
             logprobs=logprobs,
+            response_format=response_format,
         )
         if stream:
             return self._stream_chat_completion(request, self.client)
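The one-line change above threads the caller's response_format into the ChatCompletionRequest the adapter builds. The second hunk below then inspects that value, which carries a type tag and, for JSON output, a schema. The sketch below shows the rough shape implied by the diff; the class and enum definitions here are illustrative assumptions, not llama-stack's actual source.

# Rough shape of the response-format value the adapter inspects below.
# Names (ResponseFormatType, json_schema) come from the diff; these
# definitions are illustrative assumptions, not llama-stack's source.
from dataclasses import dataclass
from enum import Enum
from typing import Any, Dict

class ResponseFormatType(Enum):
    json_schema = "json_schema"
    grammar = "grammar"

@dataclass
class JsonSchemaResponseFormat:
    json_schema: Dict[str, Any]
    type: str = ResponseFormatType.json_schema.value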
@@ -180,6 +181,16 @@ class VLLMInferenceAdapter(Inference, ModelsProtocolPrivate):
             self.formatter,
         )
 
+        if fmt := request.response_format:
+            if fmt.type == ResponseFormatType.json_schema.value:
+                input_dict["extra_body"] = {
+                    "guided_json": request.response_format.json_schema
+                }
+            elif fmt.type == ResponseFormatType.grammar.value:
+                raise NotImplementedError("Grammar response format not supported yet")
+            else:
+                raise ValueError(f"Unknown response format {fmt.type}")
+
         return {
             "model": request.model,
             **input_dict,
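Under the hood this relies on vLLM's guided decoding: the adapter forwards the schema as a guided_json field in the OpenAI client's extra_body, and the vLLM server constrains token generation to match it. Below is a minimal sketch of the same mechanism invoked directly, assuming a vLLM OpenAI-compatible server at http://localhost:8000/v1; the server URL and model name are assumptions, not values from the commit.

# Minimal sketch: guided JSON decoding against a vLLM OpenAI-compatible
# server, using the same extra_body={"guided_json": ...} mechanism the
# adapter emits above. Server URL and model name are assumptions.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

person_schema = {
    "type": "object",
    "properties": {
        "name": {"type": "string"},
        "age": {"type": "integer"},
    },
    "required": ["name", "age"],
}

response = client.chat.completions.create(
    model="meta-llama/Llama-3.1-8B-Instruct",  # whatever model the server hosts
    messages=[{"role": "user", "content": "Describe a fictional person as JSON."}],
    extra_body={"guided_json": person_schema},  # vLLM-specific guided decoding
)
print(response.choices[0].message.content)  # JSON conforming to person_schema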
Inference tests (class TestInference):

@@ -139,6 +139,7 @@ class TestInference:
             "remote::tgi",
             "remote::together",
             "remote::fireworks",
+            "remote::vllm",
         ):
             pytest.skip(
                 "Other inference providers don't support structured output in completions yet"
@@ -198,6 +199,7 @@ class TestInference:
             "remote::fireworks",
             "remote::tgi",
             "remote::together",
+            "remote::vllm",
             "remote::nvidia",
         ):
             pytest.skip("Other inference providers don't support structured output yet")
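The two test hunks simply add remote::vllm to the provider allowlists of the existing structured-output tests, so those tests no longer skip when run against vLLM. A standalone approximation of what such a test asserts, reusing the client, model, and schema assumptions sketched earlier:

import json

# Standalone approximation of a structured-output assertion (client,
# model name, and person_schema reuse the assumptions sketched above).
def check_structured_output(client, model: str, schema: dict) -> None:
    resp = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": "Describe a fictional person as JSON."}],
        extra_body={"guided_json": schema},
    )
    data = json.loads(resp.choices[0].message.content)  # must parse as JSON
    # Guided decoding should guarantee the schema's required keys exist.
    assert all(key in data for key in schema["required"])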