diff --git a/llama_stack/models/llama/llama4/chat_format.py b/llama_stack/models/llama/llama4/chat_format.py
index 75d7cbc0e..c873012d6 100644
--- a/llama_stack/models/llama/llama4/chat_format.py
+++ b/llama_stack/models/llama/llama4/chat_format.py
@@ -203,14 +203,8 @@ class ChatFormat:
             tokens.extend(toks)
             images.extend(imgs)
 
-        # if message.role == "assistant" and len(message.tool_calls) > 0:
-        #     tokens.append(self.tokenizer.special_tokens["<|python_start|>"])
-
         _process_content(message.content)
 
-        # if message.role == "assistant" and len(message.tool_calls) > 0:
-        #     tokens.append(self.tokenizer.special_tokens["<|python_end|>"])
-
         if message.role == "user" and message.context is not None:
             # This is RAG context; why is it here in the chat format? I don't think
             # this is needed and can be moved upwards
@@ -222,6 +216,7 @@ class ChatFormat:
                 content = ToolUtils.encode_tool_call(t, tool_prompt_format)
                 _process_content(content)
 
+        # Tool calls and Tool Response messages should be eom
         eom = False
         if message.role == "assistant":
             eom = message.stop_reason == StopReason.end_of_message or message.tool_calls
diff --git a/llama_stack/providers/inline/inference/meta_reference/inference.py b/llama_stack/providers/inline/inference/meta_reference/inference.py
index 029314d88..da217728b 100644
--- a/llama_stack/providers/inline/inference/meta_reference/inference.py
+++ b/llama_stack/providers/inline/inference/meta_reference/inference.py
@@ -6,8 +6,11 @@
 
 import asyncio
 import logging
+import os
 from typing import AsyncGenerator, List, Optional, Union
 
+from termcolor import cprint
+
 from llama_stack.apis.common.content_types import (
     TextDelta,
     ToolCallDelta,
@@ -338,9 +341,8 @@ class MetaReferenceInferenceImpl(
         stop_reason = None
 
         for token_result in self.generator.chat_completion(request):
-            from termcolor import cprint
-
-            cprint(token_result.text, "cyan", end="")
+            if os.environ.get("LLAMA_MODELS_DEBUG", "0") == "1":
+                cprint(token_result.text, "cyan", end="")
 
             tokens.append(token_result.token)
 
@@ -390,9 +392,8 @@ class MetaReferenceInferenceImpl(
         ipython = False
 
         for token_result in self.generator.chat_completion(request):
-            from termcolor import cprint
-
-            cprint(token_result.text, "cyan", end="")
+            if os.environ.get("LLAMA_MODELS_DEBUG", "0") == "1":
+                cprint(token_result.text, "cyan", end="")
 
             tokens.append(token_result.token)
 
diff --git a/tests/integration/test_cases/inference/chat_completion.json b/tests/integration/test_cases/inference/chat_completion.json
index f842bca9a..c84d29e64 100644
--- a/tests/integration/test_cases/inference/chat_completion.json
+++ b/tests/integration/test_cases/inference/chat_completion.json
@@ -128,7 +128,7 @@
       ],
       "tool_responses": [
        {
-          "response": "{'resposne': '70 degrees and foggy'}"
+          "response": "{'response': '70 degrees and foggy'}"
        }
      ],
      "expected": [
@@ -174,7 +174,7 @@
      ],
      "tool_responses": [
        {
-          "response": "{'resposne': '70 degrees and foggy'}"
+          "response": "{'response': '70 degrees and foggy'}"
        }
      ],
      "expected": [
@@ -398,7 +398,7 @@
          "response": "{'response': 'Total expenses for January 2025: $1000'}"
        },
        {
-          "response": "{'resposne': 'Total expenses for February 2024: $2000'}"
+          "response": "{'response': 'Total expenses for February 2024: $2000'}"
        }
      ],
      "expected": [
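
Below is a minimal, hypothetical sketch of how the LLAMA_MODELS_DEBUG gate introduced above is expected to behave. The environment variable name and the cprint call come from the diff; the debug_print helper and the example call are illustrative only and are not part of the patch.

    # Illustrative sketch only -- mirrors the gating added in inference.py above.
    # LLAMA_MODELS_DEBUG is taken from the diff; debug_print is a hypothetical
    # helper, not an API in llama_stack.
    import os

    from termcolor import cprint


    def debug_print(text: str) -> None:
        # Per-token debug output is emitted only when explicitly enabled,
        # instead of being printed unconditionally on every generation step.
        if os.environ.get("LLAMA_MODELS_DEBUG", "0") == "1":
            cprint(text, "cyan", end="")


    # Run with LLAMA_MODELS_DEBUG=1 to see cyan output; with the variable
    # unset, the call below prints nothing.
    debug_print("token ")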