forked from phoenix-oss/llama-stack-mirror
chore: make cprint write to stderr (#2250)
Also do sys.exit(1) in case of errors
parent c25bd0ad58
commit 5a422e236c
11 changed files with 81 additions and 44 deletions
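termcolor's cprint forwards extra keyword arguments to the built-in print, which is the whole mechanism here: passing file=sys.stderr reroutes the colored debug output off stdout, so it cannot corrupt whatever the program actually emits. A minimal sketch of both halves of the commit message, assuming only a stock termcolor install (the error message below is illustrative, not taken from the diff):

import sys

from termcolor import cprint

# Debug stream: colored token text goes to stderr; stdout stays clean.
cprint("streamed token text", color="cyan", end="", file=sys.stderr)

# Error path: report on stderr, then exit non-zero so callers can detect failure.
cprint("error: inference failed", color="red", file=sys.stderr)
sys.exit(1)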
@@ -6,6 +6,7 @@
 import asyncio
 import os
+import sys
 from collections.abc import AsyncGenerator
 
 from pydantic import BaseModel
@@ -455,9 +456,9 @@ class MetaReferenceInferenceImpl(
                 first = token_results[0]
                 if not first.finished and not first.ignore_token:
                     if os.environ.get("LLAMA_MODELS_DEBUG", "0") in ("1", "2"):
-                        cprint(first.text, "cyan", end="")
+                        cprint(first.text, color="cyan", end="", file=sys.stderr)
                     if os.environ.get("LLAMA_MODELS_DEBUG", "0") == "2":
-                        cprint(f"<{first.token}>", "magenta", end="")
+                        cprint(f"<{first.token}>", color="magenta", end="", file=sys.stderr)
 
                 for result in token_results:
                     idx = result.batch_idx
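Since the prints now name their target stream explicitly, the routing is straightforward to verify. A self-contained check using the standard library's contextlib.redirect_stderr (the token ID is made up):

import io
import sys
from contextlib import redirect_stderr

from termcolor import cprint

buf = io.StringIO()
with redirect_stderr(buf):
    # Same call shape as the hunk above; sys.stderr resolves to the
    # redirected buffer at call time.
    cprint("<128009>", color="magenta", end="", file=sys.stderr)

assert "<128009>" in buf.getvalue()  # debug output landed on stderr, not stdout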
@@ -519,9 +520,9 @@ class MetaReferenceInferenceImpl(
             for token_results in self.generator.chat_completion([request]):
                 token_result = token_results[0]
                 if os.environ.get("LLAMA_MODELS_DEBUG", "0") == "1":
-                    cprint(token_result.text, "cyan", end="")
+                    cprint(token_result.text, color="cyan", end="", file=sys.stderr)
                 if os.environ.get("LLAMA_MODELS_DEBUG", "0") == "2":
-                    cprint(f"<{token_result.token}>", "magenta", end="")
+                    cprint(f"<{token_result.token}>", color="magenta", end="", file=sys.stderr)
 
                 if token_result.token == tokenizer.eot_id:
                     stop_reason = StopReason.end_of_turn
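End to end, the split is visible from a parent process: stdout carries the real payload, stderr carries the debug stream, and sys.exit(1) surfaces as the return code. A rough sketch, where serve.py stands in for whatever entry point runs this provider (both the script name and the env var value are illustrative):

import os
import subprocess
import sys

proc = subprocess.run(
    [sys.executable, "serve.py"],
    env={**os.environ, "LLAMA_MODELS_DEBUG": "2"},
    capture_output=True,
    text=True,
)
print("payload:", proc.stdout)        # untouched by the debug prints
print("debug:", proc.stderr)          # colored debug output ends up here
print("exit code:", proc.returncode)  # 1 if the error path ran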