From e13c92f269cc1cc404f39a20334217ba9e7e19d7 Mon Sep 17 00:00:00 2001 From: Ashwin Bharambe Date: Tue, 11 Mar 2025 09:58:25 -0700 Subject: [PATCH 01/11] revert: feat(server): Use system packages for execution (#1551) Reverts meta-llama/llama-stack#1252 The above PR breaks the following invocation: ```bash llama stack run ~/.llama/distributions/together/together-run.yaml ``` --- llama_stack/cli/stack/run.py | 52 +++++++---------------- llama_stack/distribution/server/server.py | 28 +++--------- 2 files changed, 21 insertions(+), 59 deletions(-) diff --git a/llama_stack/cli/stack/run.py b/llama_stack/cli/stack/run.py index 1e4f3c5d9..e5686fb10 100644 --- a/llama_stack/cli/stack/run.py +++ b/llama_stack/cli/stack/run.py @@ -56,6 +56,7 @@ class StackRun(Subcommand): "--env", action="append", help="Environment variables to pass to the server in KEY=VALUE format. Can be specified multiple times.", + default=[], metavar="KEY=VALUE", ) self.parser.add_argument( @@ -73,6 +74,7 @@ class StackRun(Subcommand): type=str, help="Image Type used during the build. This can be either conda or container or venv.", choices=["conda", "container", "venv"], + default="conda", ) def _run_stack_run_cmd(self, args: argparse.Namespace) -> None: @@ -118,42 +120,20 @@ class StackRun(Subcommand): except AttributeError as e: self.parser.error(f"failed to parse config file '{config_file}':\n {e}") - # If neither image type nor image name is provided, assume the server should be run directly - # using the current environment packages. - if not args.image_type and not args.image_name: - logger.info("No image type or image name provided. Assuming environment packages.") - from llama_stack.distribution.server.server import main as server_main + run_args = formulate_run_args(args.image_type, args.image_name, config, template_name) - # Build the server args from the current args passed to the CLI - server_args = argparse.Namespace() - for arg in vars(args): - # If this is a function, avoid passing it - # "args" contains: - # func=> - if callable(getattr(args, arg)): - continue - setattr(server_args, arg, getattr(args, arg)) + run_args.extend([str(config_file), str(args.port)]) + if args.disable_ipv6: + run_args.append("--disable-ipv6") - # Run the server - server_main(server_args) - else: - run_args = formulate_run_args(args.image_type, args.image_name, config, template_name) + for env_var in args.env: + if "=" not in env_var: + self.parser.error(f"Environment variable '{env_var}' must be in KEY=VALUE format") + key, value = env_var.split("=", 1) # split on first = only + if not key: + self.parser.error(f"Environment variable '{env_var}' has empty key") + run_args.extend(["--env", f"{key}={value}"]) - run_args.extend([str(config_file), str(args.port)]) - if args.disable_ipv6: - run_args.append("--disable-ipv6") - - if args.env: - for env_var in args.env: - if "=" not in env_var: - self.parser.error(f"Environment variable '{env_var}' must be in KEY=VALUE format") - return - key, value = env_var.split("=", 1) # split on first = only - if not key: - self.parser.error(f"Environment variable '{env_var}' has empty key") - return - run_args.extend(["--env", f"{key}={value}"]) - - if args.tls_keyfile and args.tls_certfile: - run_args.extend(["--tls-keyfile", args.tls_keyfile, "--tls-certfile", args.tls_certfile]) - run_with_pty(run_args) + if args.tls_keyfile and args.tls_certfile: + run_args.extend(["--tls-keyfile", args.tls_keyfile, "--tls-certfile", args.tls_certfile]) + run_with_pty(run_args) diff --git 
a/llama_stack/distribution/server/server.py b/llama_stack/distribution/server/server.py index 6b99d908d..f819d446f 100644 --- a/llama_stack/distribution/server/server.py +++ b/llama_stack/distribution/server/server.py @@ -17,7 +17,7 @@ import warnings from contextlib import asynccontextmanager from importlib.metadata import version as parse_version from pathlib import Path -from typing import Any, List, Optional, Union +from typing import Any, List, Union import yaml from fastapi import Body, FastAPI, HTTPException, Request @@ -314,17 +314,11 @@ class ClientVersionMiddleware: return await self.app(scope, receive, send) -def main(args: Optional[argparse.Namespace] = None): +def main(): """Start the LlamaStack server.""" parser = argparse.ArgumentParser(description="Start the LlamaStack server.") parser.add_argument( "--yaml-config", - dest="config", - help="(Deprecated) Path to YAML configuration file - use --config instead", - ) - parser.add_argument( - "--config", - dest="config", help="Path to YAML configuration file", ) parser.add_argument( @@ -354,19 +348,7 @@ def main(args: Optional[argparse.Namespace] = None): required="--tls-keyfile" in sys.argv, ) - # Determine whether the server args are being passed by the "run" command, if this is the case - # the args will be passed as a Namespace object to the main function, otherwise they will be - # parsed from the command line - if args is None: - args = parser.parse_args() - - # Check for deprecated argument usage - if "--yaml-config" in sys.argv: - warnings.warn( - "The '--yaml-config' argument is deprecated and will be removed in a future version. Use '--config' instead.", - DeprecationWarning, - stacklevel=2, - ) + args = parser.parse_args() if args.env: for env_pair in args.env: @@ -378,9 +360,9 @@ def main(args: Optional[argparse.Namespace] = None): logger.error(f"Error: {str(e)}") sys.exit(1) - if args.config: + if args.yaml_config: # if the user provided a config file, use it, even if template was specified - config_file = Path(args.config) + config_file = Path(args.yaml_config) if not config_file.exists(): raise ValueError(f"Config file {config_file} does not exist") logger.info(f"Using config file: {config_file}") From 0e73186a114a253a24a7638c1b6b9ad6e54b6e59 Mon Sep 17 00:00:00 2001 From: Ihar Hrachyshka Date: Tue, 11 Mar 2025 13:01:09 -0400 Subject: [PATCH 02/11] fix: Add missing shutdown handler for TorchtunePostTrainingImpl (#1535) # What does this PR do? Added missing shutdown handler. (Currently empty.) Without it, when server shuts down, it posts the following warning: ``` __main__:129 server: No shutdown method for TorchtunePostTrainingImpl ``` Signed-off-by: Ihar Hrachyshka [//]: # (If resolving an issue, uncomment and update the line below) [//]: # (Closes #[issue-number]) ## Test Plan (The test plan assumes shutdown logic is fixed, see #1495) Without the patch: ``` INFO: Uvicorn running on http://['::', '0.0.0.0']:8321 (Press CTRL+C to quit) INFO: Shutting down INFO: Waiting for application shutdown. 
INFO 2025-03-10 20:56:43,961 __main__:140 server: Shutting down INFO 2025-03-10 20:56:43,962 __main__:124 server: Shutting down DatasetsRoutingTable INFO 2025-03-10 20:56:43,964 __main__:124 server: Shutting down DatasetIORouter INFO 2025-03-10 20:56:43,965 __main__:124 server: Shutting down ScoringFunctionsRoutingTable INFO 2025-03-10 20:56:43,966 __main__:124 server: Shutting down ScoringRouter INFO 2025-03-10 20:56:43,967 __main__:124 server: Shutting down ModelsRoutingTable INFO 2025-03-10 20:56:43,968 __main__:124 server: Shutting down InferenceRouter INFO 2025-03-10 20:56:43,969 __main__:124 server: Shutting down ShieldsRoutingTable INFO 2025-03-10 20:56:43,971 __main__:124 server: Shutting down SafetyRouter INFO 2025-03-10 20:56:43,972 __main__:124 server: Shutting down VectorDBsRoutingTable INFO 2025-03-10 20:56:43,973 __main__:124 server: Shutting down VectorIORouter INFO 2025-03-10 20:56:43,974 __main__:124 server: Shutting down ToolGroupsRoutingTable INFO 2025-03-10 20:56:43,975 __main__:124 server: Shutting down ToolRuntimeRouter INFO 2025-03-10 20:56:43,976 __main__:124 server: Shutting down MetaReferenceAgentsImpl INFO 2025-03-10 20:56:43,977 __main__:124 server: Shutting down TelemetryAdapter INFO 2025-03-10 20:56:43,978 __main__:124 server: Shutting down TorchtunePostTrainingImpl WARNING 2025-03-10 20:56:43,979 __main__:129 server: No shutdown method for TorchtunePostTrainingImpl INFO 2025-03-10 20:56:43,979 __main__:124 server: Shutting down BenchmarksRoutingTable INFO 2025-03-10 20:56:43,980 __main__:124 server: Shutting down EvalRouter INFO 2025-03-10 20:56:43,981 __main__:124 server: Shutting down DistributionInspectImpl INFO: Application shutdown complete. INFO: Finished server process [33862] ``` Run with the patch and observe no warning: ``` $ kill -INT $(ps ax | grep llama_stack.distribution.server.server | grep -v nvim | awk -e '{print $1}' | sort | head -n 1) ``` ``` INFO: Uvicorn running on http://['::', '0.0.0.0']:8321 (Press CTRL+C to quit) INFO: Shutting down INFO: Waiting for application shutdown. 
INFO 2025-03-11 00:32:56,863 __main__:140 server: Shutting down INFO 2025-03-11 00:32:56,864 __main__:124 server: Shutting down DatasetsRoutingTable INFO 2025-03-11 00:32:56,866 __main__:124 server: Shutting down DatasetIORouter INFO 2025-03-11 00:32:56,867 __main__:124 server: Shutting down ScoringFunctionsRoutingTable INFO 2025-03-11 00:32:56,868 __main__:124 server: Shutting down ScoringRouter INFO 2025-03-11 00:32:56,869 __main__:124 server: Shutting down ModelsRoutingTable INFO 2025-03-11 00:32:56,870 __main__:124 server: Shutting down InferenceRouter INFO 2025-03-11 00:32:56,871 __main__:124 server: Shutting down ShieldsRoutingTable INFO 2025-03-11 00:32:56,872 __main__:124 server: Shutting down SafetyRouter INFO 2025-03-11 00:32:56,873 __main__:124 server: Shutting down VectorDBsRoutingTable INFO 2025-03-11 00:32:56,874 __main__:124 server: Shutting down VectorIORouter INFO 2025-03-11 00:32:56,875 __main__:124 server: Shutting down ToolGroupsRoutingTable INFO 2025-03-11 00:32:56,876 __main__:124 server: Shutting down ToolRuntimeRouter INFO 2025-03-11 00:32:56,877 __main__:124 server: Shutting down MetaReferenceAgentsImpl INFO 2025-03-11 00:32:56,878 __main__:124 server: Shutting down TelemetryAdapter INFO 2025-03-11 00:32:56,879 __main__:124 server: Shutting down TorchtunePostTrainingImpl INFO 2025-03-11 00:32:56,880 __main__:124 server: Shutting down BenchmarksRoutingTable INFO 2025-03-11 00:32:56,881 __main__:124 server: Shutting down EvalRouter INFO 2025-03-11 00:32:56,882 __main__:124 server: Shutting down DistributionInspectImpl ``` [//]: # (## Documentation) Signed-off-by: Ihar Hrachyshka --- .../providers/inline/post_training/torchtune/post_training.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/llama_stack/providers/inline/post_training/torchtune/post_training.py b/llama_stack/providers/inline/post_training/torchtune/post_training.py index b837362d7..3a1affc91 100644 --- a/llama_stack/providers/inline/post_training/torchtune/post_training.py +++ b/llama_stack/providers/inline/post_training/torchtune/post_training.py @@ -43,6 +43,9 @@ class TorchtunePostTrainingImpl: self.jobs = {} self.checkpoints_dict = {} + async def shutdown(self): + pass + async def supervised_fine_tune( self, job_uuid: str, From 04106b94aab2bf550a39047370bf75e724b4114f Mon Sep 17 00:00:00 2001 From: Ihar Hrachyshka Date: Tue, 11 Mar 2025 13:01:46 -0400 Subject: [PATCH 03/11] docs: Remove duplicate docs on api docs generator (#1534) # What does this PR do? Since #892, we also need to install ruamel. Instead of maintaining the list of script dependencies in multiple places, remove it and assume developers read CONTRIBUTING.md docs. [//]: # (If resolving an issue, uncomment and update the line below) [//]: # (Closes #[issue-number]) ## Test Plan Just docs. [//]: # (## Documentation) Signed-off-by: Ihar Hrachyshka --- CONTRIBUTING.md | 3 +-- docs/openapi_generator/README.md | 8 -------- 2 files changed, 1 insertion(+), 10 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index e639328f0..7c0b5d94e 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -159,8 +159,7 @@ uv run sphinx-autobuild source build/html --write-all If you modify or add new API endpoints, update the API documentation accordingly. You can do this by running the following command: ```bash -uv sync --extra dev -uv run ./docs/openapi_generator/run_openapi_generator.sh +uv run --with ".[dev]" ./docs/openapi_generator/run_openapi_generator.sh ``` The generated API documentation will be available in `docs/_static/`. 
Make sure to review the changes before committing. diff --git a/docs/openapi_generator/README.md b/docs/openapi_generator/README.md index 298df3ce0..7888e7828 100644 --- a/docs/openapi_generator/README.md +++ b/docs/openapi_generator/README.md @@ -1,9 +1 @@ The RFC Specification (OpenAPI format) is generated from the set of API endpoints located in `llama_stack/distribution/server/endpoints.py` using the `generate.py` utility. - -Please install the following packages before running the script: - -``` -pip install fire PyYAML -``` - -Then simply run `sh run_openapi_generator.sh` From c3d7d17bc4c4d815537a8ca7a5530139dd93c664 Mon Sep 17 00:00:00 2001 From: Ihar Hrachyshka Date: Tue, 11 Mar 2025 13:07:28 -0400 Subject: [PATCH 04/11] chore: fix typing hints for get_provider_impl deps arguments (#1544) # What does this PR do? It's a dict that may contain different types, as per resolver:instantiate_provider implementation. (AFAIU it also never contains ProviderSpecs, but *instances* of provider implementations.) [//]: # (If resolving an issue, uncomment and update the line below) [//]: # (Closes #[issue-number]) ## Test Plan mypy passing if enabled checks for these modules. (See #1543) [//]: # (## Documentation) Signed-off-by: Ihar Hrachyshka --- .../providers/inline/agents/meta_reference/__init__.py | 6 +++--- llama_stack/providers/inline/datasetio/localfs/__init__.py | 4 +++- .../providers/inline/eval/meta_reference/__init__.py | 6 +++--- .../providers/inline/inference/meta_reference/__init__.py | 4 ++-- .../inline/inference/sentence_transformers/__init__.py | 4 +++- llama_stack/providers/inline/inference/vllm/__init__.py | 4 ++-- .../providers/inline/post_training/torchtune/__init__.py | 6 +++--- .../providers/inline/safety/code_scanner/__init__.py | 4 +++- llama_stack/providers/inline/safety/llama_guard/__init__.py | 4 +++- .../providers/inline/safety/prompt_guard/__init__.py | 4 +++- llama_stack/providers/inline/scoring/basic/__init__.py | 6 +++--- llama_stack/providers/inline/scoring/braintrust/__init__.py | 6 +++--- .../providers/inline/scoring/llm_as_judge/__init__.py | 6 +++--- .../inline/tool_runtime/code_interpreter/__init__.py | 4 +++- llama_stack/providers/inline/vector_io/chroma/__init__.py | 6 +++--- llama_stack/providers/inline/vector_io/faiss/__init__.py | 6 +++--- llama_stack/providers/inline/vector_io/milvus/__init__.py | 6 +++--- .../providers/inline/vector_io/sqlite_vec/__init__.py | 6 +++--- 18 files changed, 52 insertions(+), 40 deletions(-) diff --git a/llama_stack/providers/inline/agents/meta_reference/__init__.py b/llama_stack/providers/inline/agents/meta_reference/__init__.py index 8f8c24170..4be064f1d 100644 --- a/llama_stack/providers/inline/agents/meta_reference/__init__.py +++ b/llama_stack/providers/inline/agents/meta_reference/__init__.py @@ -4,14 +4,14 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. 
-from typing import Dict +from typing import Any, Dict -from llama_stack.distribution.datatypes import Api, ProviderSpec +from llama_stack.distribution.datatypes import Api from .config import MetaReferenceAgentsImplConfig -async def get_provider_impl(config: MetaReferenceAgentsImplConfig, deps: Dict[Api, ProviderSpec]): +async def get_provider_impl(config: MetaReferenceAgentsImplConfig, deps: Dict[Api, Any]): from .agents import MetaReferenceAgentsImpl impl = MetaReferenceAgentsImpl( diff --git a/llama_stack/providers/inline/datasetio/localfs/__init__.py b/llama_stack/providers/inline/datasetio/localfs/__init__.py index db8aa555c..5a0876d79 100644 --- a/llama_stack/providers/inline/datasetio/localfs/__init__.py +++ b/llama_stack/providers/inline/datasetio/localfs/__init__.py @@ -4,12 +4,14 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. +from typing import Any, Dict + from .config import LocalFSDatasetIOConfig async def get_provider_impl( config: LocalFSDatasetIOConfig, - _deps, + _deps: Dict[str, Any], ): from .datasetio import LocalFSDatasetIOImpl diff --git a/llama_stack/providers/inline/eval/meta_reference/__init__.py b/llama_stack/providers/inline/eval/meta_reference/__init__.py index 56c115322..e2a7fc2cd 100644 --- a/llama_stack/providers/inline/eval/meta_reference/__init__.py +++ b/llama_stack/providers/inline/eval/meta_reference/__init__.py @@ -3,16 +3,16 @@ # # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. -from typing import Dict +from typing import Any, Dict -from llama_stack.distribution.datatypes import Api, ProviderSpec +from llama_stack.distribution.datatypes import Api from .config import MetaReferenceEvalConfig async def get_provider_impl( config: MetaReferenceEvalConfig, - deps: Dict[Api, ProviderSpec], + deps: Dict[Api, Any], ): from .eval import MetaReferenceEvalImpl diff --git a/llama_stack/providers/inline/inference/meta_reference/__init__.py b/llama_stack/providers/inline/inference/meta_reference/__init__.py index 9c923490d..3ef7cfd45 100644 --- a/llama_stack/providers/inline/inference/meta_reference/__init__.py +++ b/llama_stack/providers/inline/inference/meta_reference/__init__.py @@ -4,14 +4,14 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. -from typing import Union +from typing import Any, Dict, Union from .config import MetaReferenceInferenceConfig, MetaReferenceQuantizedInferenceConfig async def get_provider_impl( config: Union[MetaReferenceInferenceConfig, MetaReferenceQuantizedInferenceConfig], - _deps, + _deps: Dict[str, Any], ): from .inference import MetaReferenceInferenceImpl diff --git a/llama_stack/providers/inline/inference/sentence_transformers/__init__.py b/llama_stack/providers/inline/inference/sentence_transformers/__init__.py index d5710f7fd..c1d65d10c 100644 --- a/llama_stack/providers/inline/inference/sentence_transformers/__init__.py +++ b/llama_stack/providers/inline/inference/sentence_transformers/__init__.py @@ -4,6 +4,8 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. 
+from typing import Any, Dict + from llama_stack.providers.inline.inference.sentence_transformers.config import ( SentenceTransformersInferenceConfig, ) @@ -11,7 +13,7 @@ from llama_stack.providers.inline.inference.sentence_transformers.config import async def get_provider_impl( config: SentenceTransformersInferenceConfig, - _deps, + _deps: Dict[str, Any], ): from .sentence_transformers import SentenceTransformersInferenceImpl diff --git a/llama_stack/providers/inline/inference/vllm/__init__.py b/llama_stack/providers/inline/inference/vllm/__init__.py index aa0c4b101..bd0551e57 100644 --- a/llama_stack/providers/inline/inference/vllm/__init__.py +++ b/llama_stack/providers/inline/inference/vllm/__init__.py @@ -4,12 +4,12 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. -from typing import Any +from typing import Any, Dict from .config import VLLMConfig -async def get_provider_impl(config: VLLMConfig, _deps) -> Any: +async def get_provider_impl(config: VLLMConfig, _deps: Dict[str, Any]): from .vllm import VLLMInferenceImpl impl = VLLMInferenceImpl(config) diff --git a/llama_stack/providers/inline/post_training/torchtune/__init__.py b/llama_stack/providers/inline/post_training/torchtune/__init__.py index 7ef8eee01..ca7801be7 100644 --- a/llama_stack/providers/inline/post_training/torchtune/__init__.py +++ b/llama_stack/providers/inline/post_training/torchtune/__init__.py @@ -4,9 +4,9 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. -from typing import Dict +from typing import Any, Dict -from llama_stack.distribution.datatypes import Api, ProviderSpec +from llama_stack.distribution.datatypes import Api from .config import TorchtunePostTrainingConfig @@ -15,7 +15,7 @@ from .config import TorchtunePostTrainingConfig async def get_provider_impl( config: TorchtunePostTrainingConfig, - deps: Dict[Api, ProviderSpec], + deps: Dict[Api, Any], ): from .post_training import TorchtunePostTrainingImpl diff --git a/llama_stack/providers/inline/safety/code_scanner/__init__.py b/llama_stack/providers/inline/safety/code_scanner/__init__.py index 031130cb7..62975a963 100644 --- a/llama_stack/providers/inline/safety/code_scanner/__init__.py +++ b/llama_stack/providers/inline/safety/code_scanner/__init__.py @@ -4,10 +4,12 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. +from typing import Any, Dict + from .config import CodeScannerConfig -async def get_provider_impl(config: CodeScannerConfig, deps): +async def get_provider_impl(config: CodeScannerConfig, deps: Dict[str, Any]): from .code_scanner import MetaReferenceCodeScannerSafetyImpl impl = MetaReferenceCodeScannerSafetyImpl(config, deps) diff --git a/llama_stack/providers/inline/safety/llama_guard/__init__.py b/llama_stack/providers/inline/safety/llama_guard/__init__.py index ee9ee31e6..a4263b169 100644 --- a/llama_stack/providers/inline/safety/llama_guard/__init__.py +++ b/llama_stack/providers/inline/safety/llama_guard/__init__.py @@ -4,10 +4,12 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. 
+from typing import Any, Dict + from .config import LlamaGuardConfig -async def get_provider_impl(config: LlamaGuardConfig, deps): +async def get_provider_impl(config: LlamaGuardConfig, deps: Dict[str, Any]): from .llama_guard import LlamaGuardSafetyImpl assert isinstance(config, LlamaGuardConfig), f"Unexpected config type: {type(config)}" diff --git a/llama_stack/providers/inline/safety/prompt_guard/__init__.py b/llama_stack/providers/inline/safety/prompt_guard/__init__.py index 087aca6d9..747f34421 100644 --- a/llama_stack/providers/inline/safety/prompt_guard/__init__.py +++ b/llama_stack/providers/inline/safety/prompt_guard/__init__.py @@ -4,10 +4,12 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. +from typing import Any, Dict + from .config import PromptGuardConfig # noqa: F401 -async def get_provider_impl(config: PromptGuardConfig, deps): +async def get_provider_impl(config: PromptGuardConfig, deps: Dict[str, Any]): from .prompt_guard import PromptGuardSafetyImpl impl = PromptGuardSafetyImpl(config, deps) diff --git a/llama_stack/providers/inline/scoring/basic/__init__.py b/llama_stack/providers/inline/scoring/basic/__init__.py index c72434e9e..4898b973a 100644 --- a/llama_stack/providers/inline/scoring/basic/__init__.py +++ b/llama_stack/providers/inline/scoring/basic/__init__.py @@ -3,16 +3,16 @@ # # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. -from typing import Dict +from typing import Any, Dict -from llama_stack.distribution.datatypes import Api, ProviderSpec +from llama_stack.distribution.datatypes import Api from .config import BasicScoringConfig async def get_provider_impl( config: BasicScoringConfig, - deps: Dict[Api, ProviderSpec], + deps: Dict[Api, Any], ): from .scoring import BasicScoringImpl diff --git a/llama_stack/providers/inline/scoring/braintrust/__init__.py b/llama_stack/providers/inline/scoring/braintrust/__init__.py index 2ddc58bd2..f1b0112d9 100644 --- a/llama_stack/providers/inline/scoring/braintrust/__init__.py +++ b/llama_stack/providers/inline/scoring/braintrust/__init__.py @@ -3,11 +3,11 @@ # # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. -from typing import Dict +from typing import Any, Dict from pydantic import BaseModel -from llama_stack.distribution.datatypes import Api, ProviderSpec +from llama_stack.distribution.datatypes import Api from .config import BraintrustScoringConfig @@ -18,7 +18,7 @@ class BraintrustProviderDataValidator(BaseModel): async def get_provider_impl( config: BraintrustScoringConfig, - deps: Dict[Api, ProviderSpec], + deps: Dict[Api, Any], ): from .braintrust import BraintrustScoringImpl diff --git a/llama_stack/providers/inline/scoring/llm_as_judge/__init__.py b/llama_stack/providers/inline/scoring/llm_as_judge/__init__.py index 18535332e..4a83bfe13 100644 --- a/llama_stack/providers/inline/scoring/llm_as_judge/__init__.py +++ b/llama_stack/providers/inline/scoring/llm_as_judge/__init__.py @@ -3,16 +3,16 @@ # # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. 
-from typing import Dict +from typing import Any, Dict -from llama_stack.distribution.datatypes import Api, ProviderSpec +from llama_stack.distribution.datatypes import Api from .config import LlmAsJudgeScoringConfig async def get_provider_impl( config: LlmAsJudgeScoringConfig, - deps: Dict[Api, ProviderSpec], + deps: Dict[Api, Any], ): from .scoring import LlmAsJudgeScoringImpl diff --git a/llama_stack/providers/inline/tool_runtime/code_interpreter/__init__.py b/llama_stack/providers/inline/tool_runtime/code_interpreter/__init__.py index 995358d46..8317ce793 100644 --- a/llama_stack/providers/inline/tool_runtime/code_interpreter/__init__.py +++ b/llama_stack/providers/inline/tool_runtime/code_interpreter/__init__.py @@ -4,12 +4,14 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. +from typing import Any, Dict + from .config import CodeInterpreterToolConfig __all__ = ["CodeInterpreterToolConfig", "CodeInterpreterToolRuntimeImpl"] -async def get_provider_impl(config: CodeInterpreterToolConfig, _deps): +async def get_provider_impl(config: CodeInterpreterToolConfig, _deps: Dict[str, Any]): from .code_interpreter import CodeInterpreterToolRuntimeImpl impl = CodeInterpreterToolRuntimeImpl(config) diff --git a/llama_stack/providers/inline/vector_io/chroma/__init__.py b/llama_stack/providers/inline/vector_io/chroma/__init__.py index abaf01097..f39188b46 100644 --- a/llama_stack/providers/inline/vector_io/chroma/__init__.py +++ b/llama_stack/providers/inline/vector_io/chroma/__init__.py @@ -4,14 +4,14 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. -from typing import Dict +from typing import Any, Dict -from llama_stack.providers.datatypes import Api, ProviderSpec +from llama_stack.providers.datatypes import Api from .config import ChromaVectorIOConfig -async def get_provider_impl(config: ChromaVectorIOConfig, deps: Dict[Api, ProviderSpec]): +async def get_provider_impl(config: ChromaVectorIOConfig, deps: Dict[Api, Any]): from llama_stack.providers.remote.vector_io.chroma.chroma import ( ChromaVectorIOAdapter, ) diff --git a/llama_stack/providers/inline/vector_io/faiss/__init__.py b/llama_stack/providers/inline/vector_io/faiss/__init__.py index f23e1fa4f..fc8ce70b4 100644 --- a/llama_stack/providers/inline/vector_io/faiss/__init__.py +++ b/llama_stack/providers/inline/vector_io/faiss/__init__.py @@ -4,14 +4,14 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. 
-from typing import Dict +from typing import Any, Dict -from llama_stack.providers.datatypes import Api, ProviderSpec +from llama_stack.providers.datatypes import Api from .config import FaissVectorIOConfig -async def get_provider_impl(config: FaissVectorIOConfig, deps: Dict[Api, ProviderSpec]): +async def get_provider_impl(config: FaissVectorIOConfig, deps: Dict[Api, Any]): from .faiss import FaissVectorIOAdapter assert isinstance(config, FaissVectorIOConfig), f"Unexpected config type: {type(config)}" diff --git a/llama_stack/providers/inline/vector_io/milvus/__init__.py b/llama_stack/providers/inline/vector_io/milvus/__init__.py index bee6b2ded..d88a3b005 100644 --- a/llama_stack/providers/inline/vector_io/milvus/__init__.py +++ b/llama_stack/providers/inline/vector_io/milvus/__init__.py @@ -4,14 +4,14 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. -from typing import Dict +from typing import Any, Dict -from llama_stack.providers.datatypes import Api, ProviderSpec +from llama_stack.providers.datatypes import Api from .config import MilvusVectorIOConfig -async def get_provider_impl(config: MilvusVectorIOConfig, deps: Dict[Api, ProviderSpec]): +async def get_provider_impl(config: MilvusVectorIOConfig, deps: Dict[Api, Any]): from llama_stack.providers.remote.vector_io.milvus.milvus import MilvusVectorIOAdapter impl = MilvusVectorIOAdapter(config, deps[Api.inference]) diff --git a/llama_stack/providers/inline/vector_io/sqlite_vec/__init__.py b/llama_stack/providers/inline/vector_io/sqlite_vec/__init__.py index 5a2f07012..2380eb0ef 100644 --- a/llama_stack/providers/inline/vector_io/sqlite_vec/__init__.py +++ b/llama_stack/providers/inline/vector_io/sqlite_vec/__init__.py @@ -4,14 +4,14 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. -from typing import Dict +from typing import Any, Dict -from llama_stack.providers.datatypes import Api, ProviderSpec +from llama_stack.providers.datatypes import Api from .config import SQLiteVectorIOConfig -async def get_provider_impl(config: SQLiteVectorIOConfig, deps: Dict[Api, ProviderSpec]): +async def get_provider_impl(config: SQLiteVectorIOConfig, deps: Dict[Api, Any]): from .sqlite_vec import SQLiteVecVectorIOAdapter assert isinstance(config, SQLiteVectorIOConfig), f"Unexpected config type: {type(config)}" From d33b8ea3dc652fdb1c6a9c94e42c5e2dfe36eb7f Mon Sep 17 00:00:00 2001 From: Kelly Brown <86735520+kelbrown20@users.noreply.github.com> Date: Tue, 11 Mar 2025 13:12:18 -0400 Subject: [PATCH 05/11] docs: Small nits in llama CLI reference (#1542) **Description:** Fixes some small nits in the llama CLI reference Note: There are a few nits in this PR, but also has some small suggestions, feel free to close if not necessary --- .../references/llama_cli_reference/index.md | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/docs/source/references/llama_cli_reference/index.md b/docs/source/references/llama_cli_reference/index.md index 8a38fc3ae..7b7abdf88 100644 --- a/docs/source/references/llama_cli_reference/index.md +++ b/docs/source/references/llama_cli_reference/index.md @@ -1,6 +1,6 @@ # llama (server-side) CLI Reference -The `llama` CLI tool helps you setup and use the Llama Stack. It should be available on your path after installing the `llama-stack` package. +The `llama` CLI tool helps you set up and use the Llama Stack. 
The CLI is available on your path after installing the `llama-stack` package. ## Installation @@ -27,9 +27,9 @@ You have two ways to install Llama Stack: ## `llama` subcommands -1. `download`: `llama` cli tools supports downloading the model from Meta or Hugging Face. -2. `model`: Lists available models and their properties. -3. `stack`: Allows you to build and run a Llama Stack server. You can read more about this [here](../../distributions/building_distro). +1. `download`: Supports downloading models from Meta or Hugging Face. [Downloading models](#downloading-models) +2. `model`: Lists available models and their properties. [Understanding models](#understand-the-models) +3. `stack`: Allows you to build a stack using the `llama stack` distribution and run a Llama Stack server. You can read more about how to build a Llama Stack distribution in the [Build your own Distribution](../../distributions/building_distro) documentation. ### Sample Usage @@ -117,7 +117,7 @@ You should see a table like this: +----------------------------------+------------------------------------------+----------------+ ``` -To download models, you can use the llama download command. +To download models, you can use the `llama download` command. ### Downloading from [Meta](https://llama.meta.com/llama-downloads/) @@ -191,7 +191,7 @@ You should see a table like this: The `llama model` command helps you explore the model’s interface. 1. `download`: Download the model from different sources. (meta, huggingface) -2. `list`: Lists all the models available for download with hardware requirements to deploy the models. +2. `list`: Lists all the models available for download with hardware requirements for deploying the models. 3. `prompt-format`: Show llama model message formats. 4. `describe`: Describes all the properties of the model. @@ -262,13 +262,12 @@ llama model prompt-format -m Llama3.2-3B-Instruct ![alt text](../../../resources/prompt-format.png) - You will be shown a Markdown formatted description of the model interface and how prompts / messages are formatted for various scenarios. **NOTE**: Outputs in terminal are color printed to show special tokens. ### Remove model -You can run `llama model remove` to remove unecessary model: +You can run `llama model remove` to remove an unnecessary model: ``` llama model remove -m Llama-Guard-3-8B-int8 From aca82df7edfbbedfafd0f0db354ee4161e959fed Mon Sep 17 00:00:00 2001 From: Ihar Hrachyshka Date: Tue, 11 Mar 2025 13:30:55 -0400 Subject: [PATCH 06/11] fix: Multiple fixes for server shutdown (fix lifespan handling; fix handling CancelledError when raised by provider; let uvicorn handle signals) (#1495) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit # What does this PR do? If implementation raises CancelledError (e.g. when it runs its own async loop for jobs), the main server shutdown handler gets confused and doesn't attempt to shut down the main loop tasks. While at it, also fixing the following failure when this happens: ``` UnboundLocalError: cannot access local variable 'loop' where it is not associated with a value ``` Shutdown handlers were not running because lifespan logic was broken since ~Oct 2024. Fixed that too and enforcing `lifespan` now (making sure server will crash when it fails to interact with app through middleware). 
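To make the new flow concrete, here is a minimal sketch of the lifespan-driven shutdown pattern this PR moves to (stand-in impl objects, not the exact server code; the real server iterates its registered implementations):

```python
# Minimal sketch of lifespan-driven shutdown. FakeImpl is a stand-in
# for a real provider implementation; it simulates a provider that
# raises CancelledError from its own shutdown logic.
import asyncio
import logging
from contextlib import asynccontextmanager

from fastapi import FastAPI

logger = logging.getLogger(__name__)


class FakeImpl:
    async def shutdown(self) -> None:
        raise asyncio.CancelledError("Shutdown")


async def shutdown_impls(impls) -> None:
    for impl in impls:
        name = impl.__class__.__name__
        try:
            if hasattr(impl, "shutdown"):
                # Bound each provider's shutdown so one hang cannot stall exit.
                await asyncio.wait_for(impl.shutdown(), timeout=5)
            else:
                logger.warning("No shutdown method for %s", name)
        except asyncio.TimeoutError:
            logger.exception("Shutdown timeout for %s", name)
        except (Exception, asyncio.CancelledError):
            # CancelledError derives from BaseException, so it must be caught
            # explicitly here or it escapes and confuses the shutdown sequence.
            logger.exception("Failed to shutdown %s", name)


@asynccontextmanager
async def lifespan(app: FastAPI):
    # Startup happens before the yield, shutdown after; uvicorn drives
    # this when started with lifespan="on".
    yield
    await shutdown_impls(getattr(app, "impls", []))
```

Because uvicorn is now started with `lifespan="on"`, a failure in this handler surfaces as a server crash rather than being silently ignored.
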
[//]: # (If resolving an issue, uncomment and update the line below) [//]: # (Closes #[issue-number]) ## Test Plan Spotted while working on https://github.com/meta-llama/llama-stack/pull/1437 One way to trigger it without the PR above is to add `raise CancelledError` in any of the running providers' `shutdown` methods; then `kill -INT ` the server process. Validated this with the following test patch: ``` diff --git a/llama_stack/distribution/server/server.py b/llama_stack/distribution/server/server.py index b85c463a..10dad83e 100644 --- a/llama_stack/distribution/server/server.py +++ b/llama_stack/distribution/server/server.py @@ -174,6 +174,7 @@ def handle_signal(app, signum, _) -> None: except asyncio.CancelledError: pass finally: + logger.info("Stopping event loop") loop.stop() loop = asyncio.get_running_loop() diff --git a/llama_stack/providers/inline/post_training/torchtune/post_training.py b/llama_stack/providers/inline/post_training/torchtune/post_training.py index b837362d..163f43d8 100644 --- a/llama_stack/providers/inline/post_training/torchtune/post_training.py +++ b/llama_stack/providers/inline/post_training/torchtune/post_training.py @@ -3,6 +3,7 @@ # # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. +import asyncio from datetime import datetime from typing import Any, Dict, Optional @@ -43,6 +44,9 @@ class TorchtunePostTrainingImpl: self.jobs = {} self.checkpoints_dict = {} + async def shutdown(self) -> None: + raise asyncio.CancelledError("Shutdown") + async def supervised_fine_tune( self, job_uuid: str, ``` Without the fix: ``` INFO: Uvicorn running on http://['::', '0.0.0.0']:8321 (Press CTRL+C to quit) INFO: Shutting down INFO: Finished server process [52099] INFO 2025-03-07 23:25:33,548 __main__:143 server: Received signal SIGINT (2). Exiting gracefully... INFO 2025-03-07 23:25:33,550 __main__:150 server: Shutting down DatasetsRoutingTable INFO 2025-03-07 23:25:33,551 __main__:177 server: Stopping event loop ERROR 2025-03-07 23:25:33,552 asyncio:1785 uncategorized: unhandled exception during asyncio.run() shutdown task: .shutdown() done, defined at /home/ec2-user/src/llama-stack/schedule/llama_stack/distribution/server/server.py:145> exception=UnboundLocalError("cannot access local variable 'loop' where it is not associated with a value")> ╭───────────────────────────────────── Traceback (most recent call last) ─────────────────────────────────────╮ │ /home/ec2-user/src/llama-stack/schedule/llama_stack/distribution/server/server.py:178 in shutdown │ │ │ │ 175 │ │ │ pass │ │ 176 │ │ finally: │ │ 177 │ │ │ logger.info("Stopping event loop") │ │ ❱ 178 │ │ │ loop.stop() │ │ 179 │ │ │ 180 │ loop = asyncio.get_running_loop() │ │ 181 │ loop.create_task(shutdown()) │ ╰─────────────────────────────────────────────────────────────────────────────────────────────────────────────╯ UnboundLocalError: cannot access local variable 'loop' where it is not associated with a value ``` With the fix, now seeing the following messages when the server is killed: ``` INFO: Uvicorn running on http://['::', '0.0.0.0']:8321 (Press CTRL+C to quit) INFO: Shutting down INFO: Finished server process [50836] INFO 2025-03-07 23:20:35,182 __main__:143 server: Received signal SIGINT (2). Exiting gracefully... 
INFO 2025-03-07 23:20:35,184 __main__:149 server: Shutting down DatasetsRoutingTable ERROR 2025-03-07 23:20:35,185 __main__:158 server: Failed to shutdown DatasetsRoutingTable: {CancelledError()} ╭───────────────────────────────────── Traceback (most recent call last) ─────────────────────────────────────╮ │ /usr/lib64/python3.11/asyncio/tasks.py:476 in wait_for │ │ │ │ 473 │ try: │ │ 474 │ │ # wait until the future completes or the timeout │ │ 475 │ │ try: │ │ ❱ 476 │ │ │ await waiter │ │ 477 │ │ except exceptions.CancelledError: │ │ 478 │ │ │ if fut.done(): │ │ 479 │ │ │ │ return fut.result() │ ╰─────────────────────────────────────────────────────────────────────────────────────────────────────────────╯ CancelledError During handling of the above exception, another exception occurred: ╭───────────────────────────────────── Traceback (most recent call last) ─────────────────────────────────────╮ │ /home/ec2-user/src/llama-stack/schedule/llama_stack/distribution/server/server.py:152 in shutdown │ │ │ │ 149 │ │ │ logger.info("Shutting down %s", impl_name) │ │ 150 │ │ │ try: │ │ 151 │ │ │ │ if hasattr(impl, "shutdown"): │ │ ❱ 152 │ │ │ │ │ await asyncio.wait_for(impl.shutdown(), timeout=5) │ │ 153 │ │ │ │ else: │ │ 154 │ │ │ │ │ logger.warning("No shutdown method for %s", impl_name) │ │ 155 │ │ │ except asyncio.TimeoutError: │ │ │ │ /usr/lib64/python3.11/asyncio/tasks.py:479 in wait_for │ │ │ │ 476 │ │ │ await waiter │ │ 477 │ │ except exceptions.CancelledError: │ │ 478 │ │ │ if fut.done(): │ │ ❱ 479 │ │ │ │ return fut.result() │ │ 480 │ │ │ else: │ │ 481 │ │ │ │ fut.remove_done_callback(cb) │ │ 482 │ │ │ │ # We must ensure that the task is not running │ │ │ │ /home/ec2-user/src/llama-stack/schedule/llama_stack/distribution/routers/routing_tables.py:131 in shutdown │ │ │ │ 128 │ │ │ elif api == Api.tool_runtime: │ │ 129 │ │ │ │ p.tool_store = self │ │ 130 │ │ │ ❱ 131 │ async def shutdown(self) -> None: │ │ 132 │ │ for p in self.impls_by_provider_id.values(): │ │ 133 │ │ │ await p.shutdown() │ │ 134 │ ╰─────────────────────────────────────────────────────────────────────────────────────────────────────────────╯ CancelledError INFO 2025-03-07 23:20:35,295 __main__:149 server: Shutting down DatasetIORouter INFO 2025-03-07 23:20:35,296 __main__:149 server: Shutting down ScoringFunctionsRoutingTable INFO 2025-03-07 23:20:35,297 __main__:149 server: Shutting down ScoringRouter INFO 2025-03-07 23:20:35,298 __main__:149 server: Shutting down ModelsRoutingTable INFO 2025-03-07 23:20:35,299 __main__:149 server: Shutting down InferenceRouter INFO 2025-03-07 23:20:35,300 __main__:149 server: Shutting down ShieldsRoutingTable INFO 2025-03-07 23:20:35,300 __main__:149 server: Shutting down SafetyRouter INFO 2025-03-07 23:20:35,301 __main__:149 server: Shutting down VectorDBsRoutingTable INFO 2025-03-07 23:20:35,302 __main__:149 server: Shutting down VectorIORouter INFO 2025-03-07 23:20:35,303 __main__:149 server: Shutting down ToolGroupsRoutingTable INFO 2025-03-07 23:20:35,304 __main__:149 server: Shutting down ToolRuntimeRouter INFO 2025-03-07 23:20:35,304 __main__:149 server: Shutting down MetaReferenceAgentsImpl INFO 2025-03-07 23:20:35,305 __main__:149 server: Shutting down TelemetryAdapter INFO 2025-03-07 23:20:35,306 __main__:149 server: Shutting down TorchtunePostTrainingImpl ERROR 2025-03-07 23:20:35,307 __main__:158 server: Failed to shutdown TorchtunePostTrainingImpl: {CancelledError('Shutdown')} ╭───────────────────────────────────── Traceback (most recent call last) 
─────────────────────────────────────╮ │ /home/ec2-user/src/llama-stack/schedule/llama_stack/distribution/server/server.py:152 in shutdown │ │ │ │ 149 │ │ │ logger.info("Shutting down %s", impl_name) │ │ 150 │ │ │ try: │ │ 151 │ │ │ │ if hasattr(impl, "shutdown"): │ │ ❱ 152 │ │ │ │ │ await asyncio.wait_for(impl.shutdown(), timeout=5) │ │ 153 │ │ │ │ else: │ │ 154 │ │ │ │ │ logger.warning("No shutdown method for %s", impl_name) │ │ 155 │ │ │ except asyncio.TimeoutError: │ │ │ │ /usr/lib64/python3.11/asyncio/tasks.py:489 in wait_for │ │ │ │ 486 │ │ │ │ raise │ │ 487 │ │ │ │ 488 │ │ if fut.done(): │ │ ❱ 489 │ │ │ return fut.result() │ │ 490 │ │ else: │ │ 491 │ │ │ fut.remove_done_callback(cb) │ │ 492 │ │ │ # We must ensure that the task is not running │ │ │ │ /home/ec2-user/src/llama-stack/schedule/llama_stack/providers/inline/post_training/torchtune/post_training. │ │ py:48 in shutdown │ │ │ │ 45 │ │ self.checkpoints_dict = {} │ │ 46 │ │ │ 47 │ async def shutdown(self) -> None: │ │ ❱ 48 │ │ raise asyncio.CancelledError("Shutdown") │ │ 49 │ │ │ 50 │ async def supervised_fine_tune( │ │ 51 │ │ self, │ ╰─────────────────────────────────────────────────────────────────────────────────────────────────────────────╯ CancelledError: Shutdown INFO 2025-03-07 23:20:35,352 __main__:149 server: Shutting down BenchmarksRoutingTable INFO 2025-03-07 23:20:35,353 __main__:149 server: Shutting down EvalRouter INFO 2025-03-07 23:20:35,354 __main__:149 server: Shutting down DistributionInspectImpl INFO 2025-03-07 23:20:35,355 __main__:177 server: Stopping event loop Traceback (most recent call last): File "", line 198, in _run_module_as_main File "", line 88, in _run_code File "/home/ec2-user/src/llama-stack/schedule/llama_stack/distribution/server/server.py", line 488, in main() File "/home/ec2-user/src/llama-stack/schedule/llama_stack/distribution/server/server.py", line 476, in main uvicorn.run(**uvicorn_config) File "/home/ec2-user/src/llama-stack/schedule/venv/lib64/python3.11/site-packages/uvicorn/main.py", line 579, in run server.run() File "/home/ec2-user/src/llama-stack/schedule/venv/lib64/python3.11/site-packages/uvicorn/server.py", line 66, in run return asyncio.run(self.serve(sockets=sockets)) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/usr/lib64/python3.11/asyncio/runners.py", line 189, in run with Runner(debug=debug) as runner: File "/usr/lib64/python3.11/asyncio/runners.py", line 63, in __exit__ self.close() File "/usr/lib64/python3.11/asyncio/runners.py", line 71, in close _cancel_all_tasks(loop) File "/usr/lib64/python3.11/asyncio/runners.py", line 201, in _cancel_all_tasks loop.run_until_complete(tasks.gather(*to_cancel, return_exceptions=True)) File "/usr/lib64/python3.11/asyncio/base_events.py", line 652, in run_until_complete raise RuntimeError('Event loop stopped before Future completed.') RuntimeError: Event loop stopped before Future completed. ++ error_handler 104 ++ echo 'Error occurred in script at line: 104' Error occurred in script at line: 104 ++ exit 1 ``` With all patches included, the shutdown now looks as follows: ``` $ kill -INT $(ps ax | grep llama_stack.distribution.server.server | grep -v nvim | awk -e '{print $1}' | sort | head -n 1) ``` ``` 20:56:09.308 [START] INFO: Uvicorn running on http://['::', '0.0.0.0']:8321 (Press CTRL+C to quit) INFO: Shutting down INFO: Waiting for application shutdown. 
INFO 2025-03-10 20:56:43,961 __main__:140 server: Shutting down INFO 2025-03-10 20:56:43,962 __main__:124 server: Shutting down DatasetsRoutingTable INFO 2025-03-10 20:56:43,964 __main__:124 server: Shutting down DatasetIORouter INFO 2025-03-10 20:56:43,965 __main__:124 server: Shutting down ScoringFunctionsRoutingTable INFO 2025-03-10 20:56:43,966 __main__:124 server: Shutting down ScoringRouter INFO 2025-03-10 20:56:43,967 __main__:124 server: Shutting down ModelsRoutingTable INFO 2025-03-10 20:56:43,968 __main__:124 server: Shutting down InferenceRouter INFO 2025-03-10 20:56:43,969 __main__:124 server: Shutting down ShieldsRoutingTable INFO 2025-03-10 20:56:43,971 __main__:124 server: Shutting down SafetyRouter INFO 2025-03-10 20:56:43,972 __main__:124 server: Shutting down VectorDBsRoutingTable INFO 2025-03-10 20:56:43,973 __main__:124 server: Shutting down VectorIORouter INFO 2025-03-10 20:56:43,974 __main__:124 server: Shutting down ToolGroupsRoutingTable INFO 2025-03-10 20:56:43,975 __main__:124 server: Shutting down ToolRuntimeRouter INFO 2025-03-10 20:56:43,976 __main__:124 server: Shutting down MetaReferenceAgentsImpl INFO 2025-03-10 20:56:43,977 __main__:124 server: Shutting down TelemetryAdapter INFO 2025-03-10 20:56:43,978 __main__:124 server: Shutting down TorchtunePostTrainingImpl WARNING 2025-03-10 20:56:43,979 __main__:129 server: No shutdown method for TorchtunePostTrainingImpl INFO 2025-03-10 20:56:43,979 __main__:124 server: Shutting down BenchmarksRoutingTable INFO 2025-03-10 20:56:43,980 __main__:124 server: Shutting down EvalRouter INFO 2025-03-10 20:56:43,981 __main__:124 server: Shutting down DistributionInspectImpl INFO: Application shutdown complete. INFO: Finished server process [33862] ``` [//]: # (## Documentation) --------- Signed-off-by: Ihar Hrachyshka --- llama_stack/distribution/server/server.py | 87 +++++------------------ 1 file changed, 19 insertions(+), 68 deletions(-) diff --git a/llama_stack/distribution/server/server.py b/llama_stack/distribution/server/server.py index f819d446f..ea8723365 100644 --- a/llama_stack/distribution/server/server.py +++ b/llama_stack/distribution/server/server.py @@ -6,11 +6,9 @@ import argparse import asyncio -import functools import inspect import json import os -import signal import sys import traceback import warnings @@ -118,69 +116,24 @@ def translate_exception(exc: Exception) -> Union[HTTPException, RequestValidatio ) -def handle_signal(app, signum, _) -> None: +async def shutdown(app): + """Initiate a graceful shutdown of the application. + + Handled by the lifespan context manager. The shutdown process involves + shutting down all implementations registered in the application. """ - Handle incoming signals and initiate a graceful shutdown of the application. - - This function is intended to be used as a signal handler for various signals - (e.g., SIGINT, SIGTERM). Upon receiving a signal, it will print a message - indicating the received signal and initiate a shutdown process. - - Args: - app: The application instance containing implementations to be shut down. - signum (int): The signal number received. - frame: The current stack frame (not used in this function). - - The shutdown process involves: - - Shutting down all implementations registered in the application. - - Gathering all running asyncio tasks. - - Cancelling all gathered tasks. - - Waiting for all tasks to finish. - - Stopping the event loop. 
- - Note: - This function schedules the shutdown process as an asyncio task and does - not block the current execution. - """ - signame = signal.Signals(signum).name - logger.info(f"Received signal {signame} ({signum}). Exiting gracefully...") - - async def shutdown(): + for impl in app.__llama_stack_impls__.values(): + impl_name = impl.__class__.__name__ + logger.info("Shutting down %s", impl_name) try: - # Gracefully shut down implementations - for impl in app.__llama_stack_impls__.values(): - impl_name = impl.__class__.__name__ - logger.info("Shutting down %s", impl_name) - try: - if hasattr(impl, "shutdown"): - await asyncio.wait_for(impl.shutdown(), timeout=5) - else: - logger.warning("No shutdown method for %s", impl_name) - except asyncio.TimeoutError: - logger.exception("Shutdown timeout for %s ", impl_name, exc_info=True) - except Exception as e: - logger.exception("Failed to shutdown %s: %s", impl_name, {e}) - - # Gather all running tasks - loop = asyncio.get_running_loop() - tasks = [task for task in asyncio.all_tasks(loop) if task is not asyncio.current_task()] - - # Cancel all tasks - for task in tasks: - task.cancel() - - # Wait for all tasks to finish - try: - await asyncio.wait_for(asyncio.gather(*tasks, return_exceptions=True), timeout=10) - except asyncio.TimeoutError: - logger.exception("Timeout while waiting for tasks to finish") - except asyncio.CancelledError: - pass - finally: - loop.stop() - - loop = asyncio.get_running_loop() - loop.create_task(shutdown()) + if hasattr(impl, "shutdown"): + await asyncio.wait_for(impl.shutdown(), timeout=5) + else: + logger.warning("No shutdown method for %s", impl_name) + except asyncio.TimeoutError: + logger.exception("Shutdown timeout for %s ", impl_name, exc_info=True) + except (Exception, asyncio.CancelledError) as e: + logger.exception("Failed to shutdown %s: %s", impl_name, {e}) @asynccontextmanager @@ -188,8 +141,7 @@ async def lifespan(app: FastAPI): logger.info("Starting up") yield logger.info("Shutting down") - for impl in app.__llama_stack_impls__.values(): - await impl.shutdown() + await shutdown(app) def is_streaming_request(func_name: str, request: Request, **kwargs): @@ -266,7 +218,7 @@ class TracingMiddleware: self.app = app async def __call__(self, scope, receive, send): - path = scope["path"] + path = scope.get("path", "") await start_trace(path, {"__location__": "server"}) try: return await self.app(scope, receive, send) @@ -439,8 +391,6 @@ def main(): app.exception_handler(RequestValidationError)(global_exception_handler) app.exception_handler(Exception)(global_exception_handler) - signal.signal(signal.SIGINT, functools.partial(handle_signal, app)) - signal.signal(signal.SIGTERM, functools.partial(handle_signal, app)) app.__llama_stack_impls__ = impls @@ -471,6 +421,7 @@ def main(): "app": app, "host": listen_host, "port": port, + "lifespan": "on", } if ssl_config: uvicorn_config.update(ssl_config) From 83a2c78615a3b4a2ad96852023b0292a401a0463 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=A9bastien=20Han?= Date: Tue, 11 Mar 2025 18:33:46 +0100 Subject: [PATCH 07/11] feat(api): list agents / sessions and get agent (#1410) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit # What does this PR do? Add support for listing agents, describing an agent, and retrieving session IDs for a given agent. This is only the API definition, the implementations will come separately. 
Closes: https://github.com/meta-llama/llama-stack/issues/1294 Signed-off-by: Sébastien Han --- docs/_static/llama-stack-spec.html | 169 +++++++++++++++++++++++++++++ docs/_static/llama-stack-spec.yaml | 118 ++++++++++++++++++++ llama_stack/apis/agents/agents.py | 46 ++++++++ 3 files changed, 333 insertions(+) diff --git a/docs/_static/llama-stack-spec.html b/docs/_static/llama-stack-spec.html index 1a8169090..b0febbbef 100644 --- a/docs/_static/llama-stack-spec.html +++ b/docs/_static/llama-stack-spec.html @@ -363,6 +363,37 @@ } }, "/v1/agents": { + "get": { + "responses": { + "200": { + "description": "A ListAgentsResponse.", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ListAgentsResponse" + } + } + } + }, + "400": { + "$ref": "#/components/responses/BadRequest400" + }, + "429": { + "$ref": "#/components/responses/TooManyRequests429" + }, + "500": { + "$ref": "#/components/responses/InternalServerError500" + }, + "default": { + "$ref": "#/components/responses/DefaultError" + } + }, + "tags": [ + "Agents" + ], + "description": "List all agents.", + "parameters": [] + }, "post": { "responses": { "200": { @@ -609,6 +640,47 @@ } }, "/v1/agents/{agent_id}": { + "get": { + "responses": { + "200": { + "description": "An Agent of the agent.", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/Agent" + } + } + } + }, + "400": { + "$ref": "#/components/responses/BadRequest400" + }, + "429": { + "$ref": "#/components/responses/TooManyRequests429" + }, + "500": { + "$ref": "#/components/responses/InternalServerError500" + }, + "default": { + "$ref": "#/components/responses/DefaultError" + } + }, + "tags": [ + "Agents" + ], + "description": "Describe an agent by its ID.", + "parameters": [ + { + "name": "agent_id", + "in": "path", + "description": "ID of the agent.", + "required": true, + "schema": { + "type": "string" + } + } + ] + }, "delete": { "responses": { "200": { @@ -2276,6 +2348,49 @@ ] } }, + "/v1/agents/{agent_id}/sessions": { + "get": { + "responses": { + "200": { + "description": "A ListAgentSessionsResponse.", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ListAgentSessionsResponse" + } + } + } + }, + "400": { + "$ref": "#/components/responses/BadRequest400" + }, + "429": { + "$ref": "#/components/responses/TooManyRequests429" + }, + "500": { + "$ref": "#/components/responses/InternalServerError500" + }, + "default": { + "$ref": "#/components/responses/DefaultError" + } + }, + "tags": [ + "Agents" + ], + "description": "List all session(s) of a given agent.", + "parameters": [ + { + "name": "agent_id", + "in": "path", + "description": "The ID of the agent to list sessions for.", + "required": true, + "schema": { + "type": "string" + } + } + ] + } + }, "/v1/eval/benchmarks": { "get": { "responses": { @@ -6565,6 +6680,28 @@ "title": "ScoringResult", "description": "A scoring result for a single row." 
}, + "Agent": { + "type": "object", + "properties": { + "agent_id": { + "type": "string" + }, + "agent_config": { + "$ref": "#/components/schemas/AgentConfig" + }, + "created_at": { + "type": "string", + "format": "date-time" + } + }, + "additionalProperties": false, + "required": [ + "agent_id", + "agent_config", + "created_at" + ], + "title": "Agent" + }, "Session": { "type": "object", "properties": { @@ -7907,6 +8044,38 @@ ], "title": "ToolInvocationResult" }, + "ListAgentSessionsResponse": { + "type": "object", + "properties": { + "data": { + "type": "array", + "items": { + "$ref": "#/components/schemas/Session" + } + } + }, + "additionalProperties": false, + "required": [ + "data" + ], + "title": "ListAgentSessionsResponse" + }, + "ListAgentsResponse": { + "type": "object", + "properties": { + "data": { + "type": "array", + "items": { + "$ref": "#/components/schemas/Agent" + } + } + }, + "additionalProperties": false, + "required": [ + "data" + ], + "title": "ListAgentsResponse" + }, "BucketResponse": { "type": "object", "properties": { diff --git a/docs/_static/llama-stack-spec.yaml b/docs/_static/llama-stack-spec.yaml index d6001c00d..2985e6222 100644 --- a/docs/_static/llama-stack-spec.yaml +++ b/docs/_static/llama-stack-spec.yaml @@ -238,6 +238,28 @@ paths: $ref: '#/components/schemas/CompletionRequest' required: true /v1/agents: + get: + responses: + '200': + description: A ListAgentsResponse. + content: + application/json: + schema: + $ref: '#/components/schemas/ListAgentsResponse' + '400': + $ref: '#/components/responses/BadRequest400' + '429': + $ref: >- + #/components/responses/TooManyRequests429 + '500': + $ref: >- + #/components/responses/InternalServerError500 + default: + $ref: '#/components/responses/DefaultError' + tags: + - Agents + description: List all agents. + parameters: [] post: responses: '200': @@ -410,6 +432,34 @@ paths: $ref: '#/components/schemas/CreateUploadSessionRequest' required: true /v1/agents/{agent_id}: + get: + responses: + '200': + description: An Agent of the agent. + content: + application/json: + schema: + $ref: '#/components/schemas/Agent' + '400': + $ref: '#/components/responses/BadRequest400' + '429': + $ref: >- + #/components/responses/TooManyRequests429 + '500': + $ref: >- + #/components/responses/InternalServerError500 + default: + $ref: '#/components/responses/DefaultError' + tags: + - Agents + description: Describe an agent by its ID. + parameters: + - name: agent_id + in: path + description: ID of the agent. + required: true + schema: + type: string delete: responses: '200': @@ -1528,6 +1578,36 @@ paths: required: true schema: type: string + /v1/agents/{agent_id}/sessions: + get: + responses: + '200': + description: A ListAgentSessionsResponse. + content: + application/json: + schema: + $ref: '#/components/schemas/ListAgentSessionsResponse' + '400': + $ref: '#/components/responses/BadRequest400' + '429': + $ref: >- + #/components/responses/TooManyRequests429 + '500': + $ref: >- + #/components/responses/InternalServerError500 + default: + $ref: '#/components/responses/DefaultError' + tags: + - Agents + description: List all session(s) of a given agent. + parameters: + - name: agent_id + in: path + description: >- + The ID of the agent to list sessions for. + required: true + schema: + type: string /v1/eval/benchmarks: get: responses: @@ -4549,6 +4629,22 @@ components: - aggregated_results title: ScoringResult description: A scoring result for a single row. 
+ Agent: + type: object + properties: + agent_id: + type: string + agent_config: + $ref: '#/components/schemas/AgentConfig' + created_at: + type: string + format: date-time + additionalProperties: false + required: + - agent_id + - agent_config + - created_at + title: Agent Session: type: object properties: @@ -5385,6 +5481,28 @@ components: required: - content title: ToolInvocationResult + ListAgentSessionsResponse: + type: object + properties: + data: + type: array + items: + $ref: '#/components/schemas/Session' + additionalProperties: false + required: + - data + title: ListAgentSessionsResponse + ListAgentsResponse: + type: object + properties: + data: + type: array + items: + $ref: '#/components/schemas/Agent' + additionalProperties: false + required: + - data + title: ListAgentsResponse BucketResponse: type: object properties: diff --git a/llama_stack/apis/agents/agents.py b/llama_stack/apis/agents/agents.py index af4b0ba77..1170a56d5 100644 --- a/llama_stack/apis/agents/agents.py +++ b/llama_stack/apis/agents/agents.py @@ -234,6 +234,23 @@ class AgentConfig(AgentConfigCommon): response_format: Optional[ResponseFormat] = None +@json_schema_type +class Agent(BaseModel): + agent_id: str + agent_config: AgentConfig + created_at: datetime + + +@json_schema_type +class ListAgentsResponse(BaseModel): + data: List[Agent] + + +@json_schema_type +class ListAgentSessionsResponse(BaseModel): + data: List[Session] + + class AgentConfigOverridablePerTurn(AgentConfigCommon): instructions: Optional[str] = None @@ -541,3 +558,32 @@ class Agents(Protocol): :param agent_id: The ID of the agent to delete. """ ... + + @webmethod(route="/agents", method="GET") + async def list_agents(self) -> ListAgentsResponse: + """List all agents. + + :returns: A ListAgentsResponse. + """ + ... + + @webmethod(route="/agents/{agent_id}", method="GET") + async def get_agent(self, agent_id: str) -> Agent: + """Describe an agent by its ID. + + :param agent_id: ID of the agent. + :returns: An Agent of the agent. + """ + ... + + @webmethod(route="/agents/{agent_id}/sessions", method="GET") + async def list_agent_sessions( + self, + agent_id: str, + ) -> ListAgentSessionsResponse: + """List all session(s) of a given agent. + + :param agent_id: The ID of the agent to list sessions for. + :returns: A ListAgentSessionsResponse. + """ + ... From b647ecd9ed9ecf433a6ce972a06e7a339fbf7ca6 Mon Sep 17 00:00:00 2001 From: Charlie Doern Date: Tue, 11 Mar 2025 14:09:31 -0400 Subject: [PATCH 08/11] feat: add support for LLAMA_STACK_LOG_FILE (#1450) # What does this PR do? setting $LLAMA_STACK_LOG_FILE will pipe the logs to a file as well as stdout. this is done by using a logging FileHandler Signed-off-by: Charlie Doern --- docs/source/distributions/building_distro.md | 2 + llama_stack/log.py | 47 +++++++++++++------- 2 files changed, 33 insertions(+), 16 deletions(-) diff --git a/docs/source/distributions/building_distro.md b/docs/source/distributions/building_distro.md index 942596b59..37a7e7974 100644 --- a/docs/source/distributions/building_distro.md +++ b/docs/source/distributions/building_distro.md @@ -33,6 +33,8 @@ Can be set to any of the following log levels: The default global log level is `info`. `all` sets the log level for all components. +A user can also set `LLAMA_STACK_LOG_FILE` which will pipe the logs to the specified path as well as to the terminal. 
An example would be: `export LLAMA_STACK_LOG_FILE=server.log` + ### Llama Stack Build In order to build your own distribution, we recommend you clone the `llama-stack` repository. diff --git a/llama_stack/log.py b/llama_stack/log.py index 9b9f5c5d8..80ee9fa1b 100644 --- a/llama_stack/log.py +++ b/llama_stack/log.py @@ -97,12 +97,13 @@ class CustomRichHandler(RichHandler): self.markup = original_markup -def setup_logging(category_levels: Dict[str, int]) -> None: +def setup_logging(category_levels: Dict[str, int], log_file: str | None) -> None: """ - Configure logging based on the provided category log levels. + Configure logging based on the provided category log levels and an optional log file. Parameters: category_levels (Dict[str, int]): A dictionary mapping categories to their log levels. + log_file (str): Path to a log file to additionally pipe the logs into """ log_format = "[dim]%(asctime)s %(name)s:%(lineno)d[/] [yellow dim]%(category)s[/]: %(message)s" @@ -117,6 +118,28 @@ def setup_logging(category_levels: Dict[str, int]) -> None: # Determine the root logger's level (default to WARNING if not specified) root_level = category_levels.get("root", logging.WARNING) + handlers = { + "console": { + "()": CustomRichHandler, # Use custom console handler + "formatter": "rich", + "rich_tracebacks": True, + "show_time": False, + "show_path": False, + "markup": True, + "filters": ["category_filter"], + } + } + + # Add a file handler if log_file is set + if log_file: + handlers["file"] = { + "class": "logging.FileHandler", + "formatter": "rich", + "filename": log_file, + "mode": "a", + "encoding": "utf-8", + } + logging_config = { "version": 1, "disable_existing_loggers": False, @@ -126,17 +149,7 @@ def setup_logging(category_levels: Dict[str, int]) -> None: "format": log_format, } }, - "handlers": { - "console": { - "()": CustomRichHandler, # Use our custom handler class - "formatter": "rich", - "rich_tracebacks": True, - "show_time": False, - "show_path": False, - "markup": True, - "filters": ["category_filter"], - } - }, + "handlers": handlers, "filters": { "category_filter": { "()": CategoryFilter, @@ -144,14 +157,14 @@ def setup_logging(category_levels: Dict[str, int]) -> None: }, "loggers": { category: { - "handlers": ["console"], + "handlers": list(handlers.keys()), # Apply all handlers "level": category_levels.get(category, DEFAULT_LOG_LEVEL), "propagate": False, # Disable propagation to root logger } for category in CATEGORIES }, "root": { - "handlers": ["console"], + "handlers": list(handlers.keys()), "level": root_level, # Set root logger's level dynamically }, } @@ -180,4 +193,6 @@ if env_config: cprint(f"Environment variable LLAMA_STACK_LOGGING found: {env_config}", "yellow") _category_levels.update(parse_environment_config(env_config)) -setup_logging(_category_levels) +log_file = os.environ.get("LLAMA_STACK_LOG_FILE") + +setup_logging(_category_levels, log_file) From 275bab1373f13704edf3cc29a94dd37af6a5dced Mon Sep 17 00:00:00 2001 From: Nathan Weinberg <31703736+nathan-weinberg@users.noreply.github.com> Date: Tue, 11 Mar 2025 14:11:32 -0400 Subject: [PATCH 09/11] test: loosen Python 3.10 version for unit tests (#1547) # What does this PR do? as I brought up in #1515 it shouldn't be nessessary to tie the unit test runner to an exact z-stream of Python 3.10 updated so unit test runner always uses latest z-stream of Python 3.10 ## Test Plan ```shell $ uv run -p 3.10 --with-editable . 
--with-editable ".[dev]" --with-editable ".[unit]" pytest --cov=llama_stack -s -v tests/unit/ --junitxml=pytest-report.xml ``` Signed-off-by: Nathan Weinberg --- .github/workflows/unit-tests.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml index 48658047f..3acfabe70 100644 --- a/.github/workflows/unit-tests.yml +++ b/.github/workflows/unit-tests.yml @@ -14,16 +14,16 @@ jobs: - name: Set up Python uses: actions/setup-python@v5 with: - python-version: '3.10.16' + python-version: '3.10' - uses: astral-sh/setup-uv@v5 with: - python-version: '3.10.16' + python-version: '3.10' enable-cache: false - name: Run unit tests run: | - uv run -p 3.10.16 --with-editable . --with-editable ".[dev]" --with-editable ".[unit]" pytest --cov=llama_stack -s -v tests/unit/ --junitxml=pytest-report.xml + uv run -p 3.10 --with-editable . --with-editable ".[dev]" --with-editable ".[unit]" pytest --cov=llama_stack -s -v tests/unit/ --junitxml=pytest-report.xml - name: Upload test results if: always() From 85501ed8758a7b511cf972dfcb4c685ee849e368 Mon Sep 17 00:00:00 2001 From: Dinesh Yeduguru Date: Tue, 11 Mar 2025 11:19:29 -0700 Subject: [PATCH 10/11] fix: remove Llama-3.2-1B-Instruct for fireworks (#1558) # What does this PR do? remove Llama-3.2-1B-Instruct for fireworks as its no longer appears to be hosted on website. ## Test Plan python distro_codegen.py --- .../distributions/self_hosted_distro/fireworks.md | 1 - .../providers/remote/inference/fireworks/models.py | 4 ---- llama_stack/templates/ci-tests/run.yaml | 10 ---------- llama_stack/templates/dev/run.yaml | 10 ---------- llama_stack/templates/fireworks/run-with-safety.yaml | 10 ---------- llama_stack/templates/fireworks/run.yaml | 10 ---------- 6 files changed, 45 deletions(-) diff --git a/docs/source/distributions/self_hosted_distro/fireworks.md b/docs/source/distributions/self_hosted_distro/fireworks.md index 9592a18fe..3c8f5eec9 100644 --- a/docs/source/distributions/self_hosted_distro/fireworks.md +++ b/docs/source/distributions/self_hosted_distro/fireworks.md @@ -40,7 +40,6 @@ The following models are available by default: - `accounts/fireworks/models/llama-v3p1-8b-instruct (aliases: meta-llama/Llama-3.1-8B-Instruct)` - `accounts/fireworks/models/llama-v3p1-70b-instruct (aliases: meta-llama/Llama-3.1-70B-Instruct)` - `accounts/fireworks/models/llama-v3p1-405b-instruct (aliases: meta-llama/Llama-3.1-405B-Instruct-FP8)` -- `accounts/fireworks/models/llama-v3p2-1b-instruct (aliases: meta-llama/Llama-3.2-1B-Instruct)` - `accounts/fireworks/models/llama-v3p2-3b-instruct (aliases: meta-llama/Llama-3.2-3B-Instruct)` - `accounts/fireworks/models/llama-v3p2-11b-vision-instruct (aliases: meta-llama/Llama-3.2-11B-Vision-Instruct)` - `accounts/fireworks/models/llama-v3p2-90b-vision-instruct (aliases: meta-llama/Llama-3.2-90B-Vision-Instruct)` diff --git a/llama_stack/providers/remote/inference/fireworks/models.py b/llama_stack/providers/remote/inference/fireworks/models.py index c90f632ff..a0dc11768 100644 --- a/llama_stack/providers/remote/inference/fireworks/models.py +++ b/llama_stack/providers/remote/inference/fireworks/models.py @@ -24,10 +24,6 @@ MODEL_ENTRIES = [ "accounts/fireworks/models/llama-v3p1-405b-instruct", CoreModelId.llama3_1_405b_instruct.value, ), - build_hf_repo_model_entry( - "accounts/fireworks/models/llama-v3p2-1b-instruct", - CoreModelId.llama3_2_1b_instruct.value, - ), build_hf_repo_model_entry( 
"accounts/fireworks/models/llama-v3p2-3b-instruct", CoreModelId.llama3_2_3b_instruct.value, diff --git a/llama_stack/templates/ci-tests/run.yaml b/llama_stack/templates/ci-tests/run.yaml index 3a973cabf..715d7c86d 100644 --- a/llama_stack/templates/ci-tests/run.yaml +++ b/llama_stack/templates/ci-tests/run.yaml @@ -120,16 +120,6 @@ models: provider_id: fireworks provider_model_id: accounts/fireworks/models/llama-v3p1-405b-instruct model_type: llm -- metadata: {} - model_id: accounts/fireworks/models/llama-v3p2-1b-instruct - provider_id: fireworks - provider_model_id: accounts/fireworks/models/llama-v3p2-1b-instruct - model_type: llm -- metadata: {} - model_id: meta-llama/Llama-3.2-1B-Instruct - provider_id: fireworks - provider_model_id: accounts/fireworks/models/llama-v3p2-1b-instruct - model_type: llm - metadata: {} model_id: accounts/fireworks/models/llama-v3p2-3b-instruct provider_id: fireworks diff --git a/llama_stack/templates/dev/run.yaml b/llama_stack/templates/dev/run.yaml index 71fbcb353..f908af8c3 100644 --- a/llama_stack/templates/dev/run.yaml +++ b/llama_stack/templates/dev/run.yaml @@ -178,16 +178,6 @@ models: provider_id: fireworks provider_model_id: accounts/fireworks/models/llama-v3p1-405b-instruct model_type: llm -- metadata: {} - model_id: accounts/fireworks/models/llama-v3p2-1b-instruct - provider_id: fireworks - provider_model_id: accounts/fireworks/models/llama-v3p2-1b-instruct - model_type: llm -- metadata: {} - model_id: meta-llama/Llama-3.2-1B-Instruct - provider_id: fireworks - provider_model_id: accounts/fireworks/models/llama-v3p2-1b-instruct - model_type: llm - metadata: {} model_id: accounts/fireworks/models/llama-v3p2-3b-instruct provider_id: fireworks diff --git a/llama_stack/templates/fireworks/run-with-safety.yaml b/llama_stack/templates/fireworks/run-with-safety.yaml index 359bf0194..e04141a07 100644 --- a/llama_stack/templates/fireworks/run-with-safety.yaml +++ b/llama_stack/templates/fireworks/run-with-safety.yaml @@ -132,16 +132,6 @@ models: provider_id: fireworks provider_model_id: accounts/fireworks/models/llama-v3p1-405b-instruct model_type: llm -- metadata: {} - model_id: accounts/fireworks/models/llama-v3p2-1b-instruct - provider_id: fireworks - provider_model_id: accounts/fireworks/models/llama-v3p2-1b-instruct - model_type: llm -- metadata: {} - model_id: meta-llama/Llama-3.2-1B-Instruct - provider_id: fireworks - provider_model_id: accounts/fireworks/models/llama-v3p2-1b-instruct - model_type: llm - metadata: {} model_id: accounts/fireworks/models/llama-v3p2-3b-instruct provider_id: fireworks diff --git a/llama_stack/templates/fireworks/run.yaml b/llama_stack/templates/fireworks/run.yaml index 0ce3a4505..369b9ae7b 100644 --- a/llama_stack/templates/fireworks/run.yaml +++ b/llama_stack/templates/fireworks/run.yaml @@ -126,16 +126,6 @@ models: provider_id: fireworks provider_model_id: accounts/fireworks/models/llama-v3p1-405b-instruct model_type: llm -- metadata: {} - model_id: accounts/fireworks/models/llama-v3p2-1b-instruct - provider_id: fireworks - provider_model_id: accounts/fireworks/models/llama-v3p2-1b-instruct - model_type: llm -- metadata: {} - model_id: meta-llama/Llama-3.2-1B-Instruct - provider_id: fireworks - provider_model_id: accounts/fireworks/models/llama-v3p2-1b-instruct - model_type: llm - metadata: {} model_id: accounts/fireworks/models/llama-v3p2-3b-instruct provider_id: fireworks From 43044f29e2275bd6a15cd74b9cdb816f7049756f Mon Sep 17 00:00:00 2001 From: Xi Yan Date: Tue, 11 Mar 2025 11:22:22 -0700 Subject: [PATCH 11/11] 
fix: fix llama stack run with missing agent impl (#1559) # What does this PR do? - recent merge https://github.com/meta-llama/llama-stack/pull/1410 introduce error ``` ValueError: Provider meta-reference (Api.agents) does not implement the following methods: [('list_agent_sessions', 'not_actually_implemented'), ('list_agents', 'not_actually_implemented')] ``` [//]: # (If resolving an issue, uncomment and update the line below) [//]: # (Closes #[issue-number]) ## Test Plan ``` llama stack run ``` ``` LLAMA_STACK_CONFIG=fireworks pytest -v tests/integration/agents/test_agents.py --text-model meta-llama/Llama-3.3-70B-Instruct ``` https://github.com/meta-llama/llama-stack-ops/actions/runs/13795303869 [//]: # (## Documentation) --- .../inline/agents/meta_reference/agents.py | 27 ++++++++++++++----- 1 file changed, 21 insertions(+), 6 deletions(-) diff --git a/llama_stack/providers/inline/agents/meta_reference/agents.py b/llama_stack/providers/inline/agents/meta_reference/agents.py index a46fa8eb7..c24b14e35 100644 --- a/llama_stack/providers/inline/agents/meta_reference/agents.py +++ b/llama_stack/providers/inline/agents/meta_reference/agents.py @@ -12,6 +12,7 @@ import uuid from typing import AsyncGenerator, List, Optional, Union from llama_stack.apis.agents import ( + Agent, AgentConfig, AgentCreateResponse, Agents, @@ -21,6 +22,8 @@ from llama_stack.apis.agents import ( AgentTurnCreateRequest, AgentTurnResumeRequest, Document, + ListAgentSessionsResponse, + ListAgentsResponse, Session, Turn, ) @@ -84,7 +87,7 @@ class MetaReferenceAgentsImpl(Agents): agent_id=agent_id, ) - async def get_agent(self, agent_id: str) -> ChatAgent: + async def _get_agent_impl(self, agent_id: str) -> ChatAgent: agent_config = await self.persistence_store.get( key=f"agent:{agent_id}", ) @@ -120,7 +123,7 @@ class MetaReferenceAgentsImpl(Agents): agent_id: str, session_name: str, ) -> AgentSessionCreateResponse: - agent = await self.get_agent(agent_id) + agent = await self._get_agent_impl(agent_id) session_id = await agent.create_session(session_name) return AgentSessionCreateResponse( @@ -160,7 +163,7 @@ class MetaReferenceAgentsImpl(Agents): self, request: AgentTurnCreateRequest, ) -> AsyncGenerator: - agent = await self.get_agent(request.agent_id) + agent = await self._get_agent_impl(request.agent_id) async for event in agent.create_and_execute_turn(request): yield event @@ -188,12 +191,12 @@ class MetaReferenceAgentsImpl(Agents): self, request: AgentTurnResumeRequest, ) -> AsyncGenerator: - agent = await self.get_agent(request.agent_id) + agent = await self._get_agent_impl(request.agent_id) async for event in agent.resume_turn(request): yield event async def get_agents_turn(self, agent_id: str, session_id: str, turn_id: str) -> Turn: - agent = await self.get_agent(agent_id) + agent = await self._get_agent_impl(agent_id) turn = await agent.storage.get_session_turn(session_id, turn_id) return turn @@ -210,7 +213,7 @@ class MetaReferenceAgentsImpl(Agents): session_id: str, turn_ids: Optional[List[str]] = None, ) -> Session: - agent = await self.get_agent(agent_id) + agent = await self._get_agent_impl(agent_id) session_info = await agent.storage.get_session_info(session_id) if session_info is None: raise ValueError(f"Session {session_id} not found") @@ -232,3 +235,15 @@ class MetaReferenceAgentsImpl(Agents): async def shutdown(self) -> None: pass + + async def list_agents(self) -> ListAgentsResponse: + pass + + async def get_agent(self, agent_id: str) -> Agent: + pass + + async def list_agent_sessions( + self, + 
+        agent_id: str,
+    ) -> ListAgentSessionsResponse:
+        pass
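
The three GET routes added earlier in this series are now registered, but the meta-reference provider above only stubs them out, so they return empty bodies until the stubs are filled in. For readers who want to poke at the new API surface anyway, here is a minimal client-side sketch. It is illustrative only and not part of the patches: the base URL assumes a locally running stack on the default port 8321, and `httpx` is an arbitrary choice of HTTP client.

```python
# Illustrative sketch: walk the read-only Agents routes added in this series.
# Assumes a running Llama Stack server at localhost:8321 (the default port)
# and a provider that actually implements list_agents / get_agent /
# list_agent_sessions (the meta-reference stubs above do not yet).
import httpx

with httpx.Client(base_url="http://localhost:8321") as client:
    # GET /v1/agents -> ListAgentsResponse: {"data": [Agent, ...]}
    agents = client.get("/v1/agents").json()["data"]

    for agent in agents:
        agent_id = agent["agent_id"]

        # GET /v1/agents/{agent_id} -> the Agent schema
        # (agent_id, agent_config, created_at)
        detail = client.get(f"/v1/agents/{agent_id}").json()
        print(detail["agent_id"], detail["created_at"])

        # GET /v1/agents/{agent_id}/sessions -> ListAgentSessionsResponse:
        # {"data": [Session, ...]}
        sessions = client.get(f"/v1/agents/{agent_id}/sessions").json()["data"]
        print(f"  {len(sessions)} session(s)")
```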