fix: address review comments

This commit addresses review comments regarding the OpenAI chat completion implementation in the meta_reference provider.

Tool Augmentation
 - Add `augment_raw_messages_for_tools()` to properly inject tool definitions into prompts (usage sketch after this list)
 - Support model-family-specific tool formats:
   * Llama 3.1/3.2 multimodal: JsonCustomToolGenerator with JSON format
   * Llama 3.2/3.3/4: PythonListCustomToolGenerator with Python list format
 - Handle tool_choice hints (auto/required/specific tool)
 - Preserve existing system messages while adding tool context
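
A minimal usage sketch for the 3.1-style augmentation helper, not part of the commit: the module path and the SimpleNamespace stand-in for a parsed OpenAI tool are assumptions made for illustration; the helper itself is defined in the diff below.

    from types import SimpleNamespace

    from llama_stack.models.llama.datatypes import RawMessage, RawTextItem
    # Module path assumed for this sketch; the helper is added by this commit.
    from llama_stack.providers.inline.inference.meta_reference.inference import (
        _augment_raw_messages_for_tools_llama_3_1,
    )

    # Parsed OpenAI tools expose .function.name / .description / .parameters.
    weather_tool = SimpleNamespace(
        function=SimpleNamespace(
            name="get_weather",
            description="Return the current weather for a city",
            parameters={"type": "object", "properties": {"city": {"type": "string"}}},
        )
    )

    messages = [RawMessage(role="user", content=[RawTextItem(text="Weather in Paris?")])]
    augmented = _augment_raw_messages_for_tools_llama_3_1(messages, [weather_tool], "auto")

    # The JSON tool definitions now live in a prepended system message; the
    # original user message is preserved after it.
    assert augmented[0].role == "system"
    assert augmented[-1].role == "user"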

Streaming & Tool Call Detection
 - Implement streaming support via `params.stream` with `_stream_chat_completion()` (consumer sketch after this list)
 - Add tool call detection by decoding assistant messages after generation
 - Set proper `finish_reason` based on content ("stop" vs "tool_calls")
 - Convert internal ToolCall format to OpenAI-compatible types
 - Stream chunks incrementally with proper delta formatting
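
A hedged sketch of consuming the streaming path: the model id, the `impl` handle, and the driver function are placeholders invented for this example; the request/response types are the ones used in the diff below. Intermediate chunks carry text deltas, and the finish_reason arrives on the final chunk.

    from llama_stack.apis.inference import (
        OpenAIChatCompletionRequestWithExtraBody,
        OpenAIUserMessageParam,
    )

    async def stream_demo(impl) -> None:
        # `impl` is assumed to be an initialized MetaReferenceInferenceImpl.
        params = OpenAIChatCompletionRequestWithExtraBody(
            model="meta-llama/Llama-3.1-8B-Instruct",  # placeholder model id
            messages=[OpenAIUserMessageParam(role="user", content="Say hello")],
            stream=True,
        )
        stream = await impl.openai_chat_completion(params=params)
        async for chunk in stream:
            choice = chunk.choices[0]
            if choice.delta.content:
                print(choice.delta.content, end="", flush=True)
            if choice.finish_reason in ("stop", "tool_calls"):
                print(f"\n[finish_reason={choice.finish_reason}]")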

Type Corrections
 - Fix response_format handling in generators.py to properly extract the schema from the
     OpenAIJSONSchema TypedDict and use the correct ResponseFormatType enum (sketch after this list)
 - Use correct OpenAI types: OpenAIChatCompletionToolCall, OpenAIChunkChoice,
     OpenAIChoiceDelta, OpenAIChatCompletionToolCallFunction
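
A minimal sketch of the response_format extraction described above; the example JSON schema is invented, and the extraction mirrors the generators.py change in the diff below.

    from llama_stack.apis.inference import (
        JsonSchemaResponseFormat,
        OpenAIResponseFormatJSONSchema,
        ResponseFormatType,
    )

    # OpenAI-style request field: json_schema is an OpenAIJSONSchema TypedDict
    # whose nested "schema" key holds the actual JSON Schema document.
    response_format = OpenAIResponseFormatJSONSchema(
        json_schema={
            "name": "weather_report",  # invented example schema
            "schema": {"type": "object", "properties": {"temp_c": {"type": "number"}}},
        }
    )

    # generators.py now pulls out the nested schema and wraps it in the
    # internal response-format type consumed by the logits processor.
    schema_dict = response_format.json_schema.get("schema") or {}
    internal_format = JsonSchemaResponseFormat(
        type=ResponseFormatType.json_schema,
        json_schema=schema_dict,
    )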

Signed-off-by: Charlie Doern <cdoern@redhat.com>

Author: Charlie Doern
Date:   2025-11-10 15:17:08 -05:00
Commit: 1b77826aba (parent dac1ff1f57)
2 changed files with 345 additions and 9 deletions

File: generators.py (meta_reference provider)

@@ -14,7 +14,9 @@ from llama_stack.apis.inference import (
GreedySamplingStrategy,
JsonSchemaResponseFormat,
OpenAIChatCompletionRequestWithExtraBody,
OpenAIResponseFormatJSONSchema,
ResponseFormat,
ResponseFormatType,
SamplingParams,
TopPSamplingStrategy,
)
@@ -163,7 +165,8 @@ class LlamaGenerator:
sampling_params = SamplingParams()
if request.temperature is not None or request.top_p is not None:
sampling_params.strategy = TopPSamplingStrategy(
-temperature=request.temperature or 1.0, top_p=request.top_p or 1.0
+temperature=request.temperature if request.temperature is not None else 1.0,
+top_p=request.top_p if request.top_p is not None else 1.0,
)
if request.max_tokens:
sampling_params.max_tokens = request.max_tokens
@@ -177,9 +180,12 @@ class LlamaGenerator:
# Get logits processor for response format
logits_processor = None
if request.response_format:
-if isinstance(request.response_format, dict) and request.response_format.get("type") == "json_schema":
+if isinstance(request.response_format, OpenAIResponseFormatJSONSchema):
+# Extract the actual schema from OpenAIJSONSchema TypedDict
+schema_dict = request.response_format.json_schema.get("schema") or {}
json_schema_format = JsonSchemaResponseFormat(
type="json_schema", json_schema=request.response_format.get("json_schema", {})
type=ResponseFormatType.json_schema,
json_schema=schema_dict,
)
logits_processor = get_logits_processor(self.tokenizer, self.args.vocab_size, json_schema_format)

File: meta_reference provider inference implementation (MetaReferenceInferenceImpl)

@@ -16,6 +16,8 @@ from llama_stack.apis.inference import (
OpenAIChatCompletionUsage,
OpenAIChoice,
OpenAICompletionRequestWithExtraBody,
OpenAIUserMessageParam,
ToolChoice,
)
from llama_stack.apis.inference.inference import (
OpenAIChatCompletion,
@@ -24,12 +26,20 @@ from llama_stack.apis.inference.inference import (
)
from llama_stack.apis.models import Model, ModelType
from llama_stack.log import get_logger
from llama_stack.models.llama.datatypes import RawMessage, RawTextItem, ToolDefinition
from llama_stack.models.llama.llama3.chat_format import ChatFormat as Llama3ChatFormat
from llama_stack.models.llama.llama3.prompt_templates import (
JsonCustomToolGenerator,
SystemDefaultGenerator,
)
from llama_stack.models.llama.llama3.tokenizer import Tokenizer as Llama3Tokenizer
from llama_stack.models.llama.llama4.chat_format import ChatFormat as Llama4ChatFormat
from llama_stack.models.llama.llama4.prompt_templates.system_prompts import (
PythonListCustomToolGenerator as PythonListCustomToolGeneratorLlama4,
)
from llama_stack.models.llama.llama4.tokenizer import Tokenizer as Llama4Tokenizer
from llama_stack.models.llama.sku_list import resolve_model
-from llama_stack.models.llama.sku_types import ModelFamily
+from llama_stack.models.llama.sku_types import ModelFamily, is_multimodal
from llama_stack.providers.datatypes import ModelsProtocolPrivate
from llama_stack.providers.utils.inference.embedding_mixin import (
SentenceTransformerEmbeddingMixin,
@@ -49,6 +59,170 @@ log = get_logger(__name__, category="inference")
SEMAPHORE = asyncio.Semaphore(1)
def _convert_openai_tool_to_tool_definition(tool) -> ToolDefinition:
"""Convert OpenAI tool format to ToolDefinition format."""
# OpenAI tools have function.name and function.parameters
return ToolDefinition(
tool_name=tool.function.name,
description=tool.function.description or "",
parameters=tool.function.parameters or {},
)
def _get_tool_choice_prompt(tool_choice, tools) -> str:
"""Generate prompt text for tool_choice behavior."""
if not tool_choice or tool_choice == ToolChoice.auto or tool_choice == "auto":
return ""
elif tool_choice == ToolChoice.required or tool_choice == "required":
return "You MUST use one of the provided functions/tools to answer the user query."
elif tool_choice == ToolChoice.none or tool_choice == "none":
return ""
else:
# Specific tool specified
return f"You MUST use the tool `{tool_choice}` to answer the user query."
def _raw_content_as_str(content) -> str:
"""Convert RawContent to string for system messages."""
if isinstance(content, str):
return content
elif isinstance(content, RawTextItem):
return content.text
elif isinstance(content, list):
return "\n".join(_raw_content_as_str(c) for c in content)
else:
return "<media>"
def _augment_raw_messages_for_tools_llama_3_1(
raw_messages: list[RawMessage],
tools: list,
tool_choice,
) -> list[RawMessage]:
"""Augment raw messages with tool definitions for Llama 3.1 style models."""
messages = raw_messages.copy()
existing_system_message = None
if messages and messages[0].role == "system":
existing_system_message = messages.pop(0)
sys_content = ""
# Add tool definitions first (if present)
if tools:
# Convert OpenAI tools to ToolDefinitions
tool_definitions = [_convert_openai_tool_to_tool_definition(t) for t in tools]
# For OpenAI format, all tools are custom (have string names)
tool_gen = JsonCustomToolGenerator()
tool_template = tool_gen.gen(tool_definitions)
sys_content += tool_template.render()
sys_content += "\n"
# Add default system prompt
default_gen = SystemDefaultGenerator()
default_template = default_gen.gen()
sys_content += default_template.render()
# Add existing system message if present
if existing_system_message:
sys_content += "\n" + _raw_content_as_str(existing_system_message.content)
# Add tool choice prompt if needed
if tool_choice_prompt := _get_tool_choice_prompt(tool_choice, tools):
sys_content += "\n" + tool_choice_prompt
# Create new system message
new_system_message = RawMessage(
role="system",
content=[RawTextItem(text=sys_content.strip())],
)
return [new_system_message] + messages
def _augment_raw_messages_for_tools_llama_4(
raw_messages: list[RawMessage],
tools: list,
tool_choice,
) -> list[RawMessage]:
"""Augment raw messages with tool definitions for Llama 4/3.2/3.3 style models."""
messages = raw_messages.copy()
existing_system_message = None
if messages and messages[0].role == "system":
existing_system_message = messages.pop(0)
sys_content = ""
# Add tool definitions if present
if tools:
# Convert OpenAI tools to ToolDefinitions
tool_definitions = [_convert_openai_tool_to_tool_definition(t) for t in tools]
# Use python_list format for Llama 4
tool_gen = PythonListCustomToolGeneratorLlama4()
system_prompt = None
if existing_system_message:
system_prompt = _raw_content_as_str(existing_system_message.content)
tool_template = tool_gen.gen(tool_definitions, system_prompt)
sys_content = tool_template.render()
elif existing_system_message:
# No tools, just use existing system message
sys_content = _raw_content_as_str(existing_system_message.content)
# Add tool choice prompt if needed
if tool_choice_prompt := _get_tool_choice_prompt(tool_choice, tools):
sys_content += "\n" + tool_choice_prompt
if sys_content:
new_system_message = RawMessage(
role="system",
content=[RawTextItem(text=sys_content.strip())],
)
return [new_system_message] + messages
return messages
def augment_raw_messages_for_tools(
raw_messages: list[RawMessage],
params: OpenAIChatCompletionRequestWithExtraBody,
llama_model,
) -> list[RawMessage]:
"""Augment raw messages with tool definitions based on model family."""
if not params.tools:
return raw_messages
# Determine augmentation strategy based on model family
if llama_model.model_family == ModelFamily.llama3_1 or (
llama_model.model_family == ModelFamily.llama3_2 and is_multimodal(llama_model.core_model_id)
):
# Llama 3.1 and Llama 3.2 multimodal use JSON format
return _augment_raw_messages_for_tools_llama_3_1(
raw_messages,
params.tools,
params.tool_choice,
)
elif llama_model.model_family in (
ModelFamily.llama3_2,
ModelFamily.llama3_3,
ModelFamily.llama4,
):
# Llama 3.2/3.3/4 use python_list format
return _augment_raw_messages_for_tools_llama_4(
raw_messages,
params.tools,
params.tool_choice,
)
else:
# Default to Llama 3.1 style
return _augment_raw_messages_for_tools_llama_3_1(
raw_messages,
params.tools,
params.tool_choice,
)
def llama_builder_fn(config: MetaReferenceInferenceConfig, model_id: str, llama_model: Model) -> LlamaGenerator:
return LlamaGenerator(config, model_id, llama_model)
@@ -141,7 +315,6 @@ class MetaReferenceInferenceImpl(
self.llama_model = llama_model
log.info("Warming up...")
-from llama_stack.apis.inference import OpenAIUserMessageParam
await self.openai_chat_completion(
params=OpenAIChatCompletionRequestWithExtraBody(
@@ -167,10 +340,17 @@ class MetaReferenceInferenceImpl(
self.check_model(params)
# Convert OpenAI messages to RawMessages
-from llama_stack.providers.utils.inference.prompt_adapter import convert_openai_message_to_raw_message
+from llama_stack.models.llama.datatypes import StopReason
+from llama_stack.providers.utils.inference.prompt_adapter import (
+convert_openai_message_to_raw_message,
+decode_assistant_message,
+)
raw_messages = [await convert_openai_message_to_raw_message(msg) for msg in params.messages]
# Augment messages with tool definitions if tools are present
raw_messages = augment_raw_messages_for_tools(raw_messages, params, self.llama_model)
# Call generator's chat_completion method (works for both single-GPU and model-parallel)
if isinstance(self.generator, LlamaGenerator):
generator = self.generator.chat_completion(params, raw_messages)
@@ -178,14 +358,56 @@ class MetaReferenceInferenceImpl(
# Model parallel: submit task to process group
generator = self.generator.group.run_inference(("chat_completion", [params, raw_messages]))
-# Collect all generated text
+# Check if streaming is requested
if params.stream:
return self._stream_chat_completion(generator, params)
# Non-streaming: collect all generated text
generated_text = ""
for result_batch in generator:
for result in result_batch:
if not result.ignore_token and result.source == "output":
generated_text += result.text
# Decode assistant message to extract tool calls and determine stop_reason
# Default to end_of_turn if generation completed normally
decoded_message = decode_assistant_message(generated_text, StopReason.end_of_turn)
# Convert tool calls to OpenAI format
openai_tool_calls = None
if decoded_message.tool_calls:
from llama_stack.apis.inference import (
OpenAIChatCompletionToolCall,
OpenAIChatCompletionToolCallFunction,
)
openai_tool_calls = [
OpenAIChatCompletionToolCall(
# generate a uuid for the call id. This is the only inline provider that does this, so need to get creative.
id=f"call_{uuid.uuid4().hex[:24]}",
type="function",
function=OpenAIChatCompletionToolCallFunction(
name=str(tc.tool_name),
arguments=tc.arguments,
),
)
for tc in decoded_message.tool_calls
]
# Determine finish_reason based on whether tool calls are present
finish_reason = "tool_calls" if openai_tool_calls else "stop"
# Extract content from decoded message
content = ""
if isinstance(decoded_message.content, str):
content = decoded_message.content
elif isinstance(decoded_message.content, list):
for item in decoded_message.content:
if isinstance(item, RawTextItem):
content += item.text
# Create OpenAI response
# generate a uuid for the response id. This is the only inline provider that constructs these ids itself, so we need to get creative.
response_id = f"chatcmpl-{uuid.uuid4().hex[:24]}"
created = int(time.time())
@@ -199,9 +421,10 @@ class MetaReferenceInferenceImpl(
index=0,
message=OpenAIAssistantMessageParam(
role="assistant",
-content=generated_text,
+content=content,
+tool_calls=openai_tool_calls,
),
finish_reason="stop",
finish_reason=finish_reason,
logprobs=None,
)
],
@@ -211,3 +434,110 @@ class MetaReferenceInferenceImpl(
total_tokens=0, # TODO: calculate properly
),
)
async def _stream_chat_completion(
self,
generator,
params: OpenAIChatCompletionRequestWithExtraBody,
) -> AsyncIterator[OpenAIChatCompletionChunk]:
"""Stream chat completion chunks as they're generated."""
from llama_stack.apis.inference import (
OpenAIChatCompletionChunk,
OpenAIChatCompletionToolCall,
OpenAIChatCompletionToolCallFunction,
OpenAIChoiceDelta,
OpenAIChunkChoice,
)
from llama_stack.models.llama.datatypes import StopReason
from llama_stack.providers.utils.inference.prompt_adapter import decode_assistant_message
response_id = f"chatcmpl-{uuid.uuid4().hex[:24]}"
created = int(time.time())
generated_text = ""
# Yield chunks as tokens are generated
for result_batch in generator:
for result in result_batch:
if result.ignore_token or result.source != "output":
continue
generated_text += result.text
# Yield delta chunk with the new text
chunk = OpenAIChatCompletionChunk(
id=response_id,
object="chat.completion.chunk",
created=created,
model=params.model,
choices=[
OpenAIChunkChoice(
index=0,
delta=OpenAIChoiceDelta(
role="assistant",
content=result.text,
),
finish_reason="",
logprobs=None,
)
],
)
yield chunk
# After generation completes, decode the full message to extract tool calls
decoded_message = decode_assistant_message(generated_text, StopReason.end_of_turn)
# If tool calls are present, yield a final chunk with tool_calls
if decoded_message.tool_calls:
openai_tool_calls = [
OpenAIChatCompletionToolCall(
# generate a uuid for the call id. This is the only inline provider that does this, so need to get creative.
id=f"call_{uuid.uuid4().hex[:24]}",
type="function",
function=OpenAIChatCompletionToolCallFunction(
name=str(tc.tool_name),
arguments=tc.arguments,
),
)
for tc in decoded_message.tool_calls
]
# Yield chunk with tool_calls
chunk = OpenAIChatCompletionChunk(
id=response_id,
object="chat.completion.chunk",
created=created,
model=params.model,
choices=[
OpenAIChunkChoice(
index=0,
delta=OpenAIChoiceDelta(
role="assistant",
tool_calls=openai_tool_calls,
),
finish_reason="",
logprobs=None,
)
],
)
yield chunk
finish_reason = "tool_calls"
else:
finish_reason = "stop"
# Yield final chunk with finish_reason
final_chunk = OpenAIChatCompletionChunk(
id=response_id,
object="chat.completion.chunk",
created=created,
model=params.model,
choices=[
OpenAIChunkChoice(
index=0,
delta=OpenAIChoiceDelta(),
finish_reason=finish_reason,
logprobs=None,
)
],
)
yield final_chunk