From 4fb583b4076e245cbd6c9c76546d485652f78563 Mon Sep 17 00:00:00 2001
From: Ashwin Bharambe
Date: Fri, 25 Apr 2025 12:23:33 -0700
Subject: [PATCH] fix: check that the plain llama stack client can be used as
 a substitute for the OpenAI client (#2032)

With https://github.com/meta-llama/llama-stack-client-python/pull/226,
llama-stack-client can now be used as a drop-in substitute for the OpenAI
client (it is duck-typed), so downstream library code does not need to
change.

---
 .../inference/test_openai_completion.py      | 32 ++++++++++++-------
 1 file changed, 20 insertions(+), 12 deletions(-)

diff --git a/tests/integration/inference/test_openai_completion.py b/tests/integration/inference/test_openai_completion.py
index 75b53100c..46ec03d2e 100644
--- a/tests/integration/inference/test_openai_completion.py
+++ b/tests/integration/inference/test_openai_completion.py
@@ -75,19 +75,24 @@ def openai_client(client_with_models):
     return OpenAI(base_url=base_url, api_key="bar")
 
 
+@pytest.fixture(params=["openai_client", "llama_stack_client"])
+def compat_client(request):
+    return request.getfixturevalue(request.param)
+
+
 @pytest.mark.parametrize(
     "test_case",
     [
         "inference:completion:sanity",
     ],
 )
-def test_openai_completion_non_streaming(openai_client, client_with_models, text_model_id, test_case):
+def test_openai_completion_non_streaming(llama_stack_client, client_with_models, text_model_id, test_case):
     skip_if_model_doesnt_support_openai_completion(client_with_models, text_model_id)
     tc = TestCase(test_case)
 
     # ollama needs more verbose prompting for some reason here...
     prompt = "Respond to this question and explain your answer. " + tc["content"]
-    response = openai_client.completions.create(
+    response = llama_stack_client.completions.create(
         model=text_model_id,
         prompt=prompt,
         stream=False,
@@ -103,13 +108,13 @@ def test_openai_completion_non_streaming(openai_client, client_with_models, text
         "inference:completion:sanity",
     ],
 )
-def test_openai_completion_streaming(openai_client, client_with_models, text_model_id, test_case):
+def test_openai_completion_streaming(llama_stack_client, client_with_models, text_model_id, test_case):
     skip_if_model_doesnt_support_openai_completion(client_with_models, text_model_id)
     tc = TestCase(test_case)
 
     # ollama needs more verbose prompting for some reason here...
     prompt = "Respond to this question and explain your answer. " + tc["content"]
-    response = openai_client.completions.create(
+    response = llama_stack_client.completions.create(
         model=text_model_id,
         prompt=prompt,
         stream=True,
@@ -127,11 +132,11 @@ def test_openai_completion_streaming(openai_client, client_with_models, text_mod
         0,
     ],
 )
-def test_openai_completion_prompt_logprobs(openai_client, client_with_models, text_model_id, prompt_logprobs):
+def test_openai_completion_prompt_logprobs(llama_stack_client, client_with_models, text_model_id, prompt_logprobs):
     skip_if_provider_isnt_vllm(client_with_models, text_model_id)
 
     prompt = "Hello, world!"
-    response = openai_client.completions.create(
+    response = llama_stack_client.completions.create(
         model=text_model_id,
         prompt=prompt,
         stream=False,
@@ -144,11 +149,11 @@ def test_openai_completion_prompt_logprobs(openai_client, client_with_models, te
         assert len(choice.prompt_logprobs) > 0
 
 
-def test_openai_completion_guided_choice(openai_client, client_with_models, text_model_id):
+def test_openai_completion_guided_choice(llama_stack_client, client_with_models, text_model_id):
     skip_if_provider_isnt_vllm(client_with_models, text_model_id)
 
     prompt = "I am feeling really sad today."
-    response = openai_client.completions.create(
+    response = llama_stack_client.completions.create(
         model=text_model_id,
         prompt=prompt,
         stream=False,
@@ -161,6 +166,9 @@ def test_openai_completion_guided_choice(openai_client, client_with_models, text
     assert choice.text in ["joy", "sadness"]
 
 
+# Run the chat-completion tests with both the OpenAI client and the LlamaStack client
+
+
 @pytest.mark.parametrize(
     "test_case",
     [
@@ -168,13 +176,13 @@ def test_openai_completion_guided_choice(openai_client, client_with_models, text
         "inference:chat_completion:non_streaming_02",
     ],
 )
-def test_openai_chat_completion_non_streaming(openai_client, client_with_models, text_model_id, test_case):
+def test_openai_chat_completion_non_streaming(compat_client, client_with_models, text_model_id, test_case):
     skip_if_model_doesnt_support_openai_chat_completion(client_with_models, text_model_id)
     tc = TestCase(test_case)
     question = tc["question"]
     expected = tc["expected"]
 
-    response = openai_client.chat.completions.create(
+    response = compat_client.chat.completions.create(
         model=text_model_id,
         messages=[
             {
@@ -196,13 +204,13 @@ def test_openai_chat_completion_non_streaming(openai_client, client_with_models,
         "inference:chat_completion:streaming_02",
     ],
 )
-def test_openai_chat_completion_streaming(openai_client, client_with_models, text_model_id, test_case):
+def test_openai_chat_completion_streaming(compat_client, client_with_models, text_model_id, test_case):
     skip_if_model_doesnt_support_openai_chat_completion(client_with_models, text_model_id)
     tc = TestCase(test_case)
     question = tc["question"]
     expected = tc["expected"]
 
-    response = openai_client.chat.completions.create(
+    response = compat_client.chat.completions.create(
         model=text_model_id,
         messages=[{"role": "user", "content": question}],
         stream=True,
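
Note (illustrative sketch, not part of the patch): the duck-typing described in the
commit message means a LlamaStackClient instance can be handed to downstream code
that was written against the OpenAI client, with no changes to that code. The sketch
below assumes a Llama Stack server on the default local port; the model id, prompt,
and the helper ask() are placeholders rather than anything defined in this patch, and
the response shape follows the OpenAI chat-completions convention used by these tests.

    # Hypothetical downstream helper written against the OpenAI-style surface.
    from llama_stack_client import LlamaStackClient

    MODEL_ID = "meta-llama/Llama-3.2-3B-Instruct"  # placeholder; use a model registered with your stack

    def ask(client, model_id: str) -> str:
        # Same call shape works whether `client` is openai.OpenAI or
        # LlamaStackClient, because llama-stack-client mirrors the
        # chat.completions API (see the linked PR #226).
        response = client.chat.completions.create(
            model=model_id,
            messages=[{"role": "user", "content": "Say hello in one word."}],
            stream=False,
        )
        return response.choices[0].message.content

    # Pass the Llama Stack client through the same code path an OpenAI
    # client would normally take (assumed local server on port 8321).
    llama_client = LlamaStackClient(base_url="http://localhost:8321")
    print(ask(llama_client, MODEL_ID))

This is the same substitution the compat_client fixture above exercises: it
parametrizes the chat-completion tests over both client types so one test body
covers both.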