Make all methods async def again; add completion() for meta-reference (#270)

PR #201 had made several changes while trying to fix issues with getting the stream=False branches of the inference and agents APIs working. As part of this, it made a change that was slightly gratuitous: namely, making chat_completion() and its brethren "def" instead of "async def".

The rationale was that this allowed the user (within llama-stack) of this to use it as:

```
async for chunk in api.chat_completion(params)
```

However, it causes unnecessary confusion for several folks. Given that clients (e.g., llama-stack-apps) use the SDK methods (which are completely isolated) anyway, this choice was not ideal. Let's revert back so the call now looks like:

```
async for chunk in await api.chat_completion(params)
```

Bonus: Added a completion() implementation for the meta-reference provider. Technically should have been another PR :)
This commit is contained in:
Ashwin Bharambe 2024-10-18 20:50:59 -07:00 committed by GitHub
parent 95a96afe34
commit 2089427d60
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
23 changed files with 330 additions and 213 deletions

View file

@@ -126,6 +126,45 @@ async def test_model_list(inference_settings):
assert model_def.identifier == params["model"]
@pytest.mark.asyncio
async def test_completion(inference_settings):
    """Exercise completion() in both non-streaming and streaming modes.

    Only the meta-reference provider implements completion() so far;
    the test is skipped for every other provider.
    """
    impl = inference_settings["impl"]
    params = inference_settings["common_params"]

    provider = impl.routing_table.get_provider_impl(params["model"])
    if provider.__provider_id__ != "meta-reference":
        pytest.skip("Other inference providers don't support completion() yet")

    # Arguments shared by both the non-streaming and streaming calls.
    common_kwargs = dict(
        content="Roses are red,",
        model=params["model"],
        sampling_params=SamplingParams(
            max_tokens=50,
        ),
    )

    # Non-streaming: awaiting the call yields the full response object.
    response = await impl.completion(stream=False, **common_kwargs)
    assert isinstance(response, CompletionResponse)
    assert "violets are blue" in response.content

    # Streaming: awaiting the call yields an async iterator of chunks.
    chunks = [
        chunk
        async for chunk in await impl.completion(stream=True, **common_kwargs)
    ]
    assert all(isinstance(chunk, CompletionResponseStreamChunk) for chunk in chunks)
    # max_tokens=50 -> 50 token chunks plus one terminal chunk.
    assert len(chunks) == 51
    assert chunks[-1].stop_reason == StopReason.out_of_tokens
@pytest.mark.asyncio
async def test_chat_completion_non_streaming(inference_settings, sample_messages):
inference_impl = inference_settings["impl"]
@@ -146,7 +185,7 @@ async def test_chat_completion_streaming(inference_settings, sample_messages):
inference_impl = inference_settings["impl"]
response = [
r
async for r in inference_impl.chat_completion(
async for r in await inference_impl.chat_completion(
messages=sample_messages,
stream=True,
**inference_settings["common_params"],
@@ -217,7 +256,7 @@ async def test_chat_completion_with_tool_calling_streaming(
response = [
r
async for r in inference_impl.chat_completion(
async for r in await inference_impl.chat_completion(
messages=messages,
tools=[sample_tool_definition],
stream=True,