feat: add a configurable category-based logger (#1352)

A self-respecting server needs good observability which starts with configurable logging. Llama Stack had little until now. This PR adds a `logcat` facility towards that. Callsites look like: ```python logcat.debug("inference", f"params to ollama: {params}") ``` - the first parameter is a category. there is a static list of categories in `llama_stack/logcat.py` - each category can be associated with a log-level which can be configured via the `LLAMA_STACK_LOGGING` env var. - a value `LLAMA_STACK_LOGGING=inference=debug;server=info"` does the obvious thing. there is a special key called `all` which is an alias for all categories ## Test Plan Ran with `LLAMA_STACK_LOGGING="all=debug" llama stack run fireworks` and saw the following: ![image](https://github.com/user-attachments/assets/d24b95ab-3941-426c-9ea0-a4c62542e6f0) Hit it with a client-sdk test case and saw this: ![image](https://github.com/user-attachments/assets/3fee8c6c-986e-4125-a09c-f5dc019682e2)
2025-12-03 09:53:45 +00:00 · 2025-03-02 18:51:14 -08:00 · 2025-03-02 18:51:14 -08:00 · 754feba61f
commit 754feba61f
parent a9a7b11326
12 changed files with 409 additions and 47 deletions
--- a/llama_stack/providers/remote/inference/fireworks/fireworks.py
+++ b/llama_stack/providers/remote/inference/fireworks/fireworks.py
@ -8,6 +8,7 @@ from typing import AsyncGenerator, List, Optional, Union

 from fireworks.client import Fireworks

+from llama_stack import logcat
 from llama_stack.apis.common.content_types import (
    InterleavedContent,
    InterleavedContentItem,
@ -226,12 +227,14 @@ class FireworksInferenceAdapter(ModelRegistryHelper, Inference, NeedsRequestProv
            if input_dict["prompt"].startswith("<|begin_of_text|>"):
                input_dict["prompt"] = input_dict["prompt"][len("<|begin_of_text|>") :]

-        return {
+        params = {
            "model": request.model,
            **input_dict,
            "stream": request.stream,
            **self._build_options(request.sampling_params, request.response_format, request.logprobs),
        }
+        logcat.debug("inference", f"params to fireworks: {params}")
+        return params

    async def embeddings(
        self,
--- a/llama_stack/providers/remote/inference/ollama/ollama.py
+++ b/llama_stack/providers/remote/inference/ollama/ollama.py
@ -10,6 +10,7 @@ from typing import AsyncGenerator, List, Optional, Union
 import httpx
 from ollama import AsyncClient

+from llama_stack import logcat
 from llama_stack.apis.common.content_types import (
    ImageContentItem,
    InterleavedContent,
@ -203,12 +204,14 @@ class OllamaInferenceAdapter(Inference, ModelsProtocolPrivate):
            else:
                raise ValueError(f"Unknown response format type: {fmt.type}")

-        return {
+        params = {
            "model": request.model,
            **input_dict,
            "options": sampling_options,
            "stream": request.stream,
        }
+        logcat.debug("inference", f"params to ollama: {params}")
+        return params

    async def _nonstream_chat_completion(self, request: ChatCompletionRequest) -> ChatCompletionResponse:
        params = await self._get_params(request)
--- a/llama_stack/providers/remote/inference/together/together.py
+++ b/llama_stack/providers/remote/inference/together/together.py
@ -8,6 +8,7 @@ from typing import AsyncGenerator, List, Optional, Union

 from together import Together

+from llama_stack import logcat
 from llama_stack.apis.common.content_types import (
    InterleavedContent,
    InterleavedContentItem,
@ -213,12 +214,14 @@ class TogetherInferenceAdapter(ModelRegistryHelper, Inference, NeedsRequestProvi
            assert not media_present, "Together does not support media for Completion requests"
            input_dict["prompt"] = await completion_request_to_prompt(request)

-        return {
+        params = {
            "model": request.model,
            **input_dict,
            "stream": request.stream,
            **self._build_options(request.sampling_params, request.logprobs, request.response_format),
        }
+        logcat.debug("inference", f"params to together: {params}")
+        return params

    async def embeddings(
        self,