chore(misc): make tests and starter faster (#3042)

A bunch of miscellaneous cleanup focusing on tests, but ended up speeding up starter distro substantially. - Pulled llama stack client init for tests into `pytest_sessionstart` so it does not clobber output - Profiling of that told me where we were doing lots of heavy imports for starter, so lazied them - starter now starts 20seconds+ faster on my Mac - A few other smallish refactors for `compat_client`
2025-12-03 18:00:36 +00:00 · 2025-08-05 14:55:05 -07:00 · 2025-08-05 14:55:05 -07:00 · 7f834339ba
commit 7f834339ba
parent e12524af85
45 changed files with 2897 additions and 1688 deletions
--- a/tests/integration/fixtures/common.py
+++ b/tests/integration/fixtures/common.py
@ -82,8 +82,7 @@ def wait_for_server_ready(base_url: str, timeout: int = 30, process: subprocess.
    return False


-@pytest.fixture(scope="session")
-def provider_data():
+def get_provider_data():
    # TODO: this needs to be generalized so each provider can have a sample provider data just
    # like sample run config on which we can do replace_env_vars()
    keymap = {
@ -178,8 +177,14 @@ def skip_if_no_model(request):


@pytest.fixture(scope="session")
-def llama_stack_client(request, provider_data):
-    config = request.config.getoption("--stack-config")
+def llama_stack_client(request):
+    client = request.session._llama_stack_client
+    assert client is not None, "llama_stack_client not found in session cache"
+    return client
+
+
+def instantiate_llama_stack_client(session):
+    config = session.config.getoption("--stack-config")
    if not config:
        config = get_env_or_fail("LLAMA_STACK_CONFIG")

@ -212,13 +217,13 @@ def llama_stack_client(request, provider_data):
            print(f"Server is ready at {base_url}")

            # Store process for potential cleanup (pytest will handle termination at session end)
-            request.session._llama_stack_server_process = server_process
+            session._llama_stack_server_process = server_process
        else:
            print(f"Port {port} is already in use, assuming server is already running...")

        return LlamaStackClient(
            base_url=base_url,
-            provider_data=provider_data,
+            provider_data=get_provider_data(),
            timeout=int(os.environ.get("LLAMA_STACK_CLIENT_TIMEOUT", "30")),
        )

@ -228,7 +233,7 @@ def llama_stack_client(request, provider_data):
        if parsed_url.scheme and parsed_url.netloc:
            return LlamaStackClient(
                base_url=config,
-                provider_data=provider_data,
+                provider_data=get_provider_data(),
            )
    except Exception:
        # If URL parsing fails, treat as non-URL config
@ -243,7 +248,7 @@ def llama_stack_client(request, provider_data):

    client = LlamaStackAsLibraryClient(
        config,
-        provider_data=provider_data,
+        provider_data=get_provider_data(),
        skip_logger_removal=True,
    )
    if not client.initialize():
@ -258,8 +263,17 @@ def openai_client(client_with_models):
    return OpenAI(base_url=base_url, api_key="fake")


-@pytest.fixture(params=["openai_client", "llama_stack_client"])
-def compat_client(request):
+@pytest.fixture(params=["openai_client", "client_with_models"])
+def compat_client(request, client_with_models):
+    if isinstance(client_with_models, LlamaStackAsLibraryClient):
+        # OpenAI client expects a server, so unless we also rewrite OpenAI client's requests
+        # to go via the Stack library client (which itself rewrites requests to be served inline),
+        # we cannot do this.
+        #
+        # This means when we are using Stack as a library, we will test only via the Llama Stack client.
+        # When we are using a server setup, we can exercise both OpenAI and Llama Stack clients.
+        pytest.skip("(OpenAI) Compat client cannot be used with Stack library client")
+
    return request.getfixturevalue(request.param)