final fixes

Repository: https://github.com/meta-llama/llama-stack.git
Commit: d3ebc18559
Parent: 971566fd74
3 changed files with 11 additions and 15 deletions
@@ -203,14 +203,8 @@ class ChatFormat:
             tokens.extend(toks)
             images.extend(imgs)
 
-        # if message.role == "assistant" and len(message.tool_calls) > 0:
-        # tokens.append(self.tokenizer.special_tokens["<|python_start|>"])
-
         _process_content(message.content)
 
-        # if message.role == "assistant" and len(message.tool_calls) > 0:
-        # tokens.append(self.tokenizer.special_tokens["<|python_end|>"])
-
         if message.role == "user" and message.context is not None:
             # This is RAG context; why is it here in the chat format? I don't think
             # this is needed and can be moved upwards
@@ -222,6 +216,7 @@ class ChatFormat:
                 content = ToolUtils.encode_tool_call(t, tool_prompt_format)
                 _process_content(content)
 
+        # Tool calls and Tool Response messages should be eom
         eom = False
         if message.role == "assistant":
             eom = message.stop_reason == StopReason.end_of_message or message.tool_calls
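The comment added here makes the intent explicit: assistant turns that issue tool calls, like tool-response turns, should end with an end-of-message token rather than an end-of-turn token. A minimal sketch of how such an eom flag would typically select the closing special token, assuming the surrounding encode_message appends one of two special tokens whose names are not shown in this diff:

    # Sketch only: "<|eom_id|>" and "<|eot_id|>" are assumed token names.
    closing = "<|eom_id|>" if eom else "<|eot_id|>"
    tokens.append(self.tokenizer.special_tokens[closing])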
@@ -6,8 +6,11 @@
 
 import asyncio
 import logging
+import os
 from typing import AsyncGenerator, List, Optional, Union
 
+from termcolor import cprint
+
 from llama_stack.apis.common.content_types import (
     TextDelta,
     ToolCallDelta,
@@ -338,9 +341,8 @@ class MetaReferenceInferenceImpl(
         stop_reason = None
 
         for token_result in self.generator.chat_completion(request):
-            from termcolor import cprint
-
-            cprint(token_result.text, "cyan", end="")
+            if os.environ.get("LLAMA_MODELS_DEBUG", "0") == "1":
+                cprint(token_result.text, "cyan", end="")
 
             tokens.append(token_result.token)
 
@@ -390,9 +392,8 @@ class MetaReferenceInferenceImpl(
         ipython = False
 
         for token_result in self.generator.chat_completion(request):
-            from termcolor import cprint
-
-            cprint(token_result.text, "cyan", end="")
+            if os.environ.get("LLAMA_MODELS_DEBUG", "0") == "1":
+                cprint(token_result.text, "cyan", end="")
 
             tokens.append(token_result.token)
 
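The two streaming loops above used to import termcolor inline and print every generated token unconditionally; after this change the import lives at the top of the module (see the earlier import hunk) and the per-token print runs only when the LLAMA_MODELS_DEBUG environment variable is "1". A self-contained sketch of the same gating pattern, with the generator and its token fields as stand-ins rather than the real API:

    import os

    from termcolor import cprint


    def collect_tokens(results):
        # Print each token in cyan only when debug output is explicitly enabled.
        debug = os.environ.get("LLAMA_MODELS_DEBUG", "0") == "1"
        tokens = []
        for token_result in results:
            if debug:
                cprint(token_result.text, "cyan", end="")
            tokens.append(token_result.token)
        return tokens

Running the server with LLAMA_MODELS_DEBUG=1 turns the per-token trace on; by default the loop stays silent.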
@@ -128,7 +128,7 @@
             ],
             "tool_responses": [
                 {
-                    "response": "{'resposne': '70 degrees and foggy'}"
+                    "response": "{'response': '70 degrees and foggy'}"
                 }
             ],
             "expected": [
@@ -174,7 +174,7 @@
             ],
             "tool_responses": [
                 {
-                    "response": "{'resposne': '70 degrees and foggy'}"
+                    "response": "{'response': '70 degrees and foggy'}"
                 }
             ],
             "expected": [
@@ -398,7 +398,7 @@
                     "response": "{'response': 'Total expenses for January 2025: $1000'}"
                 },
                 {
-                    "response": "{'resposne': 'Total expenses for February 2024: $2000'}"
+                    "response": "{'response': 'Total expenses for February 2024: $2000'}"
                 }
             ],
             "expected": [
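The last three hunks fix the same typo, 'resposne' → 'response', inside the stringified tool-response payloads of the test fixtures. A hypothetical check on such a payload, assuming the fixture string is parsed with ast.literal_eval (it is a Python-dict-style string, not JSON), shows why the spelling matters:

    import ast

    payload = "{'response': '70 degrees and foggy'}"
    parsed = ast.literal_eval(payload)
    assert "response" in parsed  # would fail against the old 'resposne' spelling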